From 697755e6e412b3a041df87718bb5e79d441be910 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 11:15:34 -0400 Subject: [PATCH 01/35] Add NVRTC support to precision tools --- include/boost/math/tools/precision.hpp | 159 ++++++++++--------------- 1 file changed, 65 insertions(+), 94 deletions(-) diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index b181a2c4a..0055d9028 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -15,12 +15,15 @@ #include #include #include + +#ifndef BOOST_MATH_HAS_NVRTC #include #include #include #include #include #include // LDBL_MANT_DIG +#endif namespace boost{ namespace math { @@ -41,62 +44,30 @@ namespace tools template inline constexpr int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept { - static_assert( ::std::numeric_limits::is_specialized, "Type T must be specialized"); - static_assert( ::std::numeric_limits::radix == 2 || ::std::numeric_limits::radix == 10, "Type T must have a radix of 2 or 10"); + static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); + static_assert( ::boost::math::numeric_limits::radix == 2 || ::boost::math::numeric_limits::radix == 10, "Type T must have a radix of 2 or 10"); - return std::numeric_limits::radix == 2 - ? std::numeric_limits::digits - : ((std::numeric_limits::digits + 1) * 1000L) / 301L; + return boost::math::numeric_limits::radix == 2 + ? boost::math::numeric_limits::digits + : ((boost::math::numeric_limits::digits + 1) * 1000L) / 301L; } template -inline constexpr T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +inline constexpr T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { - static_assert( ::std::numeric_limits::is_specialized, "Type T must be specialized"); - return (std::numeric_limits::max)(); + static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); + return (boost::math::numeric_limits::max)(); } // Also used as a finite 'infinite' value for - and +infinity, for example: // -max_value = -1.79769e+308, max_value = 1.79769e+308. -#ifdef BOOST_MATH_HAS_GPU_SUPPORT - -template <> -BOOST_MATH_GPU_ENABLED constexpr float max_value() noexcept -{ - return FLT_MAX; -} - -template <> -BOOST_MATH_GPU_ENABLED constexpr double max_value() noexcept -{ - return DBL_MAX; -} - -#endif - template -inline constexpr T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +inline constexpr T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { - static_assert( ::std::numeric_limits::is_specialized, "Type T must be specialized"); - - return (std::numeric_limits::min)(); -} + static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); -#ifdef BOOST_MATH_HAS_GPU_SUPPORT - -template <> -BOOST_MATH_GPU_ENABLED constexpr float min_value() noexcept -{ - return FLT_MIN; + return (boost::math::numeric_limits::min)(); } -template <> -BOOST_MATH_GPU_ENABLED constexpr double min_value() noexcept -{ - return DBL_MIN; -} - -#endif - namespace detail{ // // Logarithmic limits come next, note that although @@ -107,13 +78,13 @@ namespace detail{ // For type float first: // template -BOOST_MATH_GPU_ENABLED constexpr T log_max_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED constexpr T log_max_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return 88.0f; } template -BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return -87.0f; } @@ -121,13 +92,13 @@ BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const std::integral_constant -BOOST_MATH_GPU_ENABLED constexpr T log_max_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED constexpr T log_max_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return 709.0; } template -BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return -708.0; } @@ -135,19 +106,19 @@ BOOST_MATH_GPU_ENABLED constexpr T log_min_value(const std::integral_constant -BOOST_MATH_GPU_ENABLED inline constexpr T log_max_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T log_max_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return 11356.0L; } template -BOOST_MATH_GPU_ENABLED inline constexpr T log_min_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T log_min_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return -11355.0L; } template -BOOST_MATH_GPU_ENABLED inline T log_max_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +BOOST_MATH_GPU_ENABLED inline T log_max_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { BOOST_MATH_STD_USING #ifdef __SUNPRO_CC @@ -160,7 +131,7 @@ BOOST_MATH_GPU_ENABLED inline T log_max_value(const std::integral_constant -BOOST_MATH_GPU_ENABLED inline T log_min_value(const std::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +BOOST_MATH_GPU_ENABLED inline T log_min_value(const boost::math::integral_constant& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { BOOST_MATH_STD_USING #ifdef __SUNPRO_CC @@ -173,21 +144,21 @@ BOOST_MATH_GPU_ENABLED inline T log_min_value(const std::integral_constant -constexpr T epsilon(const std::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(std::is_floating_point::value) +constexpr T epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { - return std::numeric_limits::epsilon(); + return boost::math::numeric_limits::epsilon(); } #ifdef BOOST_MATH_HAS_GPU_SUPPORT template <> -BOOST_MATH_GPU_ENABLED constexpr float epsilon(const std::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(float)) noexcept(true) +BOOST_MATH_GPU_ENABLED constexpr float epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(float)) noexcept(true) { return FLT_EPSILON; } template <> -BOOST_MATH_GPU_ENABLED constexpr double epsilon(const std::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(double)) noexcept(true) +BOOST_MATH_GPU_ENABLED constexpr double epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(double)) noexcept(true) { return DBL_EPSILON; } @@ -196,7 +167,7 @@ BOOST_MATH_GPU_ENABLED constexpr double epsilon(const std::true_type& BOOST_MATH #if defined(__GNUC__) && ((LDBL_MANT_DIG == 106) || (__LDBL_MANT_DIG__ == 106)) template <> -inline constexpr long double epsilon(const std::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) noexcept(std::is_floating_point::value) +inline constexpr long double epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) noexcept(boost::math::is_floating_point::value) { // numeric_limits on Darwin (and elsewhere) tells lies here: // the issue is that long double on a few platforms is @@ -215,7 +186,7 @@ inline constexpr long double epsilon(const std::true_type& BOOST_MA #endif template -inline T epsilon(const std::false_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +inline T epsilon(const boost::math::false_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { // Note: don't cache result as precision may vary at runtime: BOOST_MATH_STD_USING // for ADL of std names @@ -225,23 +196,23 @@ inline T epsilon(const std::false_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE template struct log_limit_traits { - typedef typename std::conditional< - (std::numeric_limits::radix == 2) && - (std::numeric_limits::max_exponent == 128 - || std::numeric_limits::max_exponent == 1024 - || std::numeric_limits::max_exponent == 16384), - std::integral_constant::max_exponent > INT_MAX ? INT_MAX : static_cast(std::numeric_limits::max_exponent))>, - std::integral_constant + typedef typename boost::math::conditional< + (boost::math::numeric_limits::radix == 2) && + (boost::math::numeric_limits::max_exponent == 128 + || boost::math::numeric_limits::max_exponent == 1024 + || boost::math::numeric_limits::max_exponent == 16384), + boost::math::integral_constant::max_exponent > (boost::math::numeric_limits::max)() ? (boost::math::numeric_limits::max)() : static_cast(boost::math::numeric_limits::max_exponent))>, + boost::math::integral_constant >::type tag_type; static constexpr bool value = (tag_type::value != 0); - static_assert(::std::numeric_limits::is_specialized || !value, "Type T must be specialized or equal to 0"); + static_assert(::boost::math::numeric_limits::is_specialized || !value, "Type T must be specialized or equal to 0"); }; template struct log_limit_noexcept_traits_imp : public log_limit_traits {}; -template struct log_limit_noexcept_traits_imp : public std::integral_constant {}; +template struct log_limit_noexcept_traits_imp : public boost::math::integral_constant {}; template -struct log_limit_noexcept_traits : public log_limit_noexcept_traits_imp::value> {}; +struct log_limit_noexcept_traits : public log_limit_noexcept_traits_imp::value> {}; } // namespace detail @@ -256,9 +227,9 @@ BOOST_MATH_GPU_ENABLED inline constexpr T log_max_value(BOOST_MATH_EXPLICIT_TEMP #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS return detail::log_max_value(typename detail::log_limit_traits::tag_type()); #else - BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); BOOST_MATH_STD_USING - static const T val = log((std::numeric_limits::max)()); + static const T val = log((boost::math::numeric_limits::max)()); return val; #endif } @@ -269,9 +240,9 @@ BOOST_MATH_GPU_ENABLED inline constexpr T log_min_value(BOOST_MATH_EXPLICIT_TEMP #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS return detail::log_min_value(typename detail::log_limit_traits::tag_type()); #else - BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); BOOST_MATH_STD_USING - static const T val = log((std::numeric_limits::min)()); + static const T val = log((boost::math::numeric_limits::min)()); return val; #endif } @@ -284,11 +255,11 @@ template BOOST_MATH_GPU_ENABLED constexpr T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept(boost::math::is_floating_point::value) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - return detail::epsilon(std::integral_constant::is_specialized>()); + return detail::epsilon(boost::math::integral_constant::is_specialized>()); #else - return ::std::numeric_limits::is_specialized ? - detail::epsilon(std::true_type()) : - detail::epsilon(std::false_type()); + return ::boost::math::numeric_limits::is_specialized ? + detail::epsilon(boost::math::true_type()) : + detail::epsilon(boost::math::false_type()); #endif } @@ -318,25 +289,25 @@ BOOST_MATH_GPU_ENABLED constexpr long double epsilon(BOOST_MATH_EXPLICIT_TEMPLAT namespace detail{ template -BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.00034526698300124390839884978618400831996329879769945L); } template -BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.1490116119384765625e-7L); } template -inline constexpr T root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.32927225399135962333569506281281311031656150598474e-9L); } template -inline constexpr T root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.1387778780781445675529539585113525390625e-16L); } @@ -350,32 +321,32 @@ inline T root_epsilon_imp(const T*, const Tag&) } template -inline T root_epsilon_imp(const T*, const std::integral_constant&) +inline T root_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING return sqrt(tools::epsilon()); } template -BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.0049215666011518482998719164346805794944150447839903L); } template -BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(6.05545445239333906078989272793696693569753008995e-6L); } template -inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(4.76837158203125e-7L); } template -inline constexpr T cbrt_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(5.7749313854154005630396773604745549542403508090496e-12L); } @@ -389,32 +360,32 @@ inline T cbrt_epsilon_imp(const T*, const Tag&) } template -inline T cbrt_epsilon_imp(const T*, const std::integral_constant&) +inline T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING; return pow(tools::epsilon(), T(1) / 3); } template -BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.018581361171917516667460937040007436176452688944747L); } template -BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.0001220703125L); } template -inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.18145860519450699870567321328132261891067079047605e-4L); } template -inline constexpr T forth_root_epsilon_imp(const T*, const std::integral_constant&) noexcept(std::is_floating_point::value) +inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.37252902984619140625e-8L); } @@ -428,7 +399,7 @@ inline T forth_root_epsilon_imp(const T*, const Tag&) } template -inline T forth_root_epsilon_imp(const T*, const std::integral_constant&) +inline T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING return sqrt(sqrt(tools::epsilon())); @@ -437,26 +408,26 @@ inline T forth_root_epsilon_imp(const T*, const std::integral_constant&) template struct root_epsilon_traits { - typedef std::integral_constant::radix == 2) && (::std::numeric_limits::digits != INT_MAX) ? std::numeric_limits::digits : 0> tag_type; + typedef boost::math::integral_constant::radix == 2) && (::boost::math::numeric_limits::digits != (boost::math::numeric_limits::max)()) ? boost::math::numeric_limits::digits : 0> tag_type; static constexpr bool has_noexcept = (tag_type::value == 113) || (tag_type::value == 64) || (tag_type::value == 53) || (tag_type::value == 24); }; } template -BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon() noexcept(boost::math::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::root_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } template -BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon() noexcept(boost::math::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::cbrt_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } template -BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon() noexcept(std::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon() noexcept(boost::math::is_floating_point::value && detail::root_epsilon_traits::has_noexcept) { return detail::forth_root_epsilon_imp(static_cast(nullptr), typename detail::root_epsilon_traits::tag_type()); } From 4aeb9c1b8c8846bddb915bb602d5a8a39bba935e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 11:51:23 -0400 Subject: [PATCH 02/35] Fix missing header in error handling --- include/boost/math/policies/error_handling.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index d1d74c534..07e112721 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -10,6 +10,7 @@ #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC From 14694c42264ea49b2b32b7e0b2dca44acedc4d24 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 11:52:00 -0400 Subject: [PATCH 03/35] Add minimal implantation of pair for NVRTC platform --- include/boost/math/tools/tuple.hpp | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index f132b97f5..fba37fb4f 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -9,7 +9,32 @@ #include -#ifdef BOOST_MATH_ENABLE_CUDA +#ifdef BOOST_MATH_HAS_NVRTC + +namespace boost { +namespace math { + +template +class pair +{ +public: + T first; + U second; + + BOOST_MATH_GPU_ENABLED pair(T x, U y) noexcept : first {x}, second {y} {} +}; + +template +BOOST_MATH_GPU_ENABLED auto make_pair(T x, U y) noexcept +{ + return pair(x, y); +} + +} // namespace math +} // namespace boost + + +#elif defined(BOOST_MATH_ENABLE_CUDA) #include #include From ec5b2a96df35f89de0f507207609d9f2dfe66e43 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 11:52:15 -0400 Subject: [PATCH 04/35] Add NVRTC support to derived accessors --- .../boost/math/distributions/detail/derived_accessors.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/boost/math/distributions/detail/derived_accessors.hpp b/include/boost/math/distributions/detail/derived_accessors.hpp index 9d39f2d59..90679ef21 100644 --- a/include/boost/math/distributions/detail/derived_accessors.hpp +++ b/include/boost/math/distributions/detail/derived_accessors.hpp @@ -28,9 +28,12 @@ // can find the definitions referred to herein. // -#include -#include #include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif #ifdef _MSC_VER # pragma warning(push) From a6d14dd6e2dcf134f51702cb6bd8ee33c0befc81 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 11:52:32 -0400 Subject: [PATCH 05/35] Add NVRTC support to the arcsine dist --- include/boost/math/distributions/arcsine.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/include/boost/math/distributions/arcsine.hpp b/include/boost/math/distributions/arcsine.hpp index d921e1145..899bfb1b2 100644 --- a/include/boost/math/distributions/arcsine.hpp +++ b/include/boost/math/distributions/arcsine.hpp @@ -30,15 +30,21 @@ #ifndef BOOST_MATH_DIST_ARCSINE_HPP #define BOOST_MATH_DIST_ARCSINE_HPP -#include #include #include -#include #include // complements. #include // error checks. #include - #include // isnan. +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#include +#include +#include // For std::domain_error. +#endif #if defined (BOOST_MSVC) # pragma warning(push) @@ -46,9 +52,6 @@ // in domain_error_imp in error_handling. #endif -#include -#include // For std::domain_error. - namespace boost { namespace math From 2356c73759d000a1baff1fbbefa6b7c6876522b9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 12:01:23 -0400 Subject: [PATCH 06/35] Undef BOOST_FPU_EXCEPTION_GUARD if exists and give a new empty def --- include/boost/math/tools/config.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 4c8e3dcf3..d886e420d 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -799,6 +799,13 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING +// This should be defined to nothing but since it is not specifically a math macro +// we need to undef before proceeding +#ifdef BOOST_FPU_EXCEPTION_GUARD +# undef BOOST_FPU_EXCEPTION_GUARD +# define BOOST_FPU_EXCEPTION_GUARD +#endif + template BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; b = t; } From 791e9af4f3ca563bbffdc6d6d14d81f235dde554 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 12:01:34 -0400 Subject: [PATCH 07/35] Add testing of the arcsine dist --- test/nvrtc_jamfile | 8 + test/test_arcsine_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_arcsine_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_arcsine_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_arcsine_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_arcsine_quantile_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_arcsine_quantile_nvrtc_float.cpp | 191 ++++++++++++++++++++ 7 files changed, 1154 insertions(+) create mode 100644 test/test_arcsine_cdf_nvrtc_double.cpp create mode 100644 test/test_arcsine_cdf_nvrtc_float.cpp create mode 100644 test/test_arcsine_pdf_nvrtc_double.cpp create mode 100644 test/test_arcsine_pdf_nvrtc_float.cpp create mode 100644 test/test_arcsine_quantile_nvrtc_double.cpp create mode 100644 test/test_arcsine_quantile_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 434ca002b..325dcfb14 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -9,6 +9,14 @@ project : requirements [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; +# Distributions +run test_arcsine_cdf_nvrtc_double.cpp ; +run test_arcsine_cdf_nvrtc_float.cpp ; +run test_arcsine_pdf_nvrtc_double.cpp ; +run test_arcsine_pdf_nvrtc_float.cpp ; +run test_arcsine_quantile_nvrtc_double.cpp ; +run test_arcsine_quantile_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_arcsine_cdf_nvrtc_double.cpp b/test/test_arcsine_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..814d7491a --- /dev/null +++ b/test/test_arcsine_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_arcsine_cdf_nvrtc_float.cpp b/test/test_arcsine_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a159e608c --- /dev/null +++ b/test/test_arcsine_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_arcsine_pdf_nvrtc_double.cpp b/test/test_arcsine_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..dde875629 --- /dev/null +++ b/test/test_arcsine_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_arcsine_pdf_nvrtc_float.cpp b/test/test_arcsine_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..82b39c3cf --- /dev/null +++ b/test/test_arcsine_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_arcsine_quantile_nvrtc_double.cpp b/test/test_arcsine_quantile_nvrtc_double.cpp new file mode 100644 index 000000000..8a2d39ed5 --- /dev/null +++ b/test/test_arcsine_quantile_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_arcsine_quantile_nvrtc_float.cpp b/test/test_arcsine_quantile_nvrtc_float.cpp new file mode 100644 index 000000000..7491b7e11 --- /dev/null +++ b/test/test_arcsine_quantile_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::arcsine_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::arcsine_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 59c1dd4d8aa0122e5f5f8b5bea96955c3b7d08c3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 13:42:13 -0400 Subject: [PATCH 08/35] Fix FPU Guard macro --- include/boost/math/tools/config.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index d886e420d..a5c8b3fc9 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -803,9 +803,10 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur // we need to undef before proceeding #ifdef BOOST_FPU_EXCEPTION_GUARD # undef BOOST_FPU_EXCEPTION_GUARD -# define BOOST_FPU_EXCEPTION_GUARD #endif +#define BOOST_FPU_EXCEPTION_GUARD + template BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; b = t; } From dfcda439be3a289ffd2d158ec06badb98422380a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 13:42:30 -0400 Subject: [PATCH 09/35] Add NVRTC support to bernoulli dist --- include/boost/math/distributions/bernoulli.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/bernoulli.hpp b/include/boost/math/distributions/bernoulli.hpp index 639f9119c..f1c693f7f 100644 --- a/include/boost/math/distributions/bernoulli.hpp +++ b/include/boost/math/distributions/bernoulli.hpp @@ -28,15 +28,19 @@ #ifndef BOOST_MATH_SPECIAL_BERNOULLI_HPP #define BOOST_MATH_SPECIAL_BERNOULLI_HPP -#include #include #include #include #include // complements #include // error checks #include // isnan. +#include +#include +#ifndef BOOST_MATH_HAS_NVRTC #include +#include +#endif namespace boost { From ba549c51349dec9dd09700a3232d73fb0169797a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 13:42:45 -0400 Subject: [PATCH 10/35] Add bernoulli dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_bernoulli_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_bernoulli_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_bernoulli_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_bernoulli_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_bernoulli_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_bernoulli_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_bernoulli_cdf_nvrtc_double.cpp create mode 100644 test/test_bernoulli_cdf_nvrtc_float.cpp create mode 100644 test/test_bernoulli_pdf_nvrtc_double.cpp create mode 100644 test/test_bernoulli_pdf_nvrtc_float.cpp create mode 100644 test/test_bernoulli_quan_nvrtc_double.cpp create mode 100644 test/test_bernoulli_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 325dcfb14..10e7077d1 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -17,6 +17,13 @@ run test_arcsine_pdf_nvrtc_float.cpp ; run test_arcsine_quantile_nvrtc_double.cpp ; run test_arcsine_quantile_nvrtc_float.cpp ; +run test_bernoulli_cdf_nvrtc_double.cpp ; +run test_bernoulli_cdf_nvrtc_float.cpp ; +run test_bernoulli_pdf_nvrtc_double.cpp ; +run test_bernoulli_pdf_nvrtc_float.cpp ; +run test_bernoulli_quan_nvrtc_double.cpp ; +run test_bernoulli_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_bernoulli_cdf_nvrtc_double.cpp b/test/test_bernoulli_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..fc21b9ef6 --- /dev/null +++ b/test/test_bernoulli_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::bernoulli_distribution(), round(in1[i])); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::bernoulli_distribution(), round(h_in1[i])); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bernoulli_cdf_nvrtc_float.cpp b/test/test_bernoulli_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d5112b6be --- /dev/null +++ b/test/test_bernoulli_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::bernoulli_distribution(), round(in1[i])); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::bernoulli_distribution(), round(h_in1[i])); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bernoulli_pdf_nvrtc_double.cpp b/test/test_bernoulli_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..f1b4bdf99 --- /dev/null +++ b/test/test_bernoulli_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::bernoulli_distribution(), round(in1[i])); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::bernoulli_distribution(), round(h_in1[i])); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bernoulli_pdf_nvrtc_float.cpp b/test/test_bernoulli_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..245945cde --- /dev/null +++ b/test/test_bernoulli_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::bernoulli_distribution(), round(in1[i])); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::bernoulli_distribution(), round(h_in1[i])); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bernoulli_quan_nvrtc_double.cpp b/test/test_bernoulli_quan_nvrtc_double.cpp new file mode 100644 index 000000000..63c8ae7fb --- /dev/null +++ b/test/test_bernoulli_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::bernoulli_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::bernoulli_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bernoulli_quan_nvrtc_float.cpp b/test/test_bernoulli_quan_nvrtc_float.cpp new file mode 100644 index 000000000..526438867 --- /dev/null +++ b/test/test_bernoulli_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::bernoulli_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_digamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::bernoulli_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From ed65c6e91bc4dbe0bfae624e47e865fa44610f69 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 14:39:14 -0400 Subject: [PATCH 11/35] Fix kernel naming --- test/test_arcsine_cdf_nvrtc_double.cpp | 8 ++++---- test/test_arcsine_cdf_nvrtc_float.cpp | 8 ++++---- test/test_arcsine_pdf_nvrtc_double.cpp | 8 ++++---- test/test_arcsine_pdf_nvrtc_float.cpp | 8 ++++---- test/test_arcsine_quantile_nvrtc_double.cpp | 8 ++++---- test/test_arcsine_quantile_nvrtc_float.cpp | 8 ++++---- test/test_bernoulli_cdf_nvrtc_double.cpp | 8 ++++---- test/test_bernoulli_cdf_nvrtc_float.cpp | 8 ++++---- test/test_bernoulli_pdf_nvrtc_double.cpp | 8 ++++---- test/test_bernoulli_pdf_nvrtc_float.cpp | 8 ++++---- test/test_bernoulli_quan_nvrtc_double.cpp | 8 ++++---- test/test_bernoulli_quan_nvrtc_float.cpp | 8 ++++---- 12 files changed, 48 insertions(+), 48 deletions(-) diff --git a/test/test_arcsine_cdf_nvrtc_double.cpp b/test/test_arcsine_cdf_nvrtc_double.cpp index 814d7491a..58b6b8297 100644 --- a/test/test_arcsine_cdf_nvrtc_double.cpp +++ b/test/test_arcsine_cdf_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_arcsine_cdf_nvrtc_float.cpp b/test/test_arcsine_cdf_nvrtc_float.cpp index a159e608c..3f8b04bb0 100644 --- a/test/test_arcsine_cdf_nvrtc_float.cpp +++ b/test/test_arcsine_cdf_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_arcsine_pdf_nvrtc_double.cpp b/test/test_arcsine_pdf_nvrtc_double.cpp index dde875629..c76b47f88 100644 --- a/test/test_arcsine_pdf_nvrtc_double.cpp +++ b/test/test_arcsine_pdf_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_arcsine_pdf_nvrtc_float.cpp b/test/test_arcsine_pdf_nvrtc_float.cpp index 82b39c3cf..e9145a162 100644 --- a/test/test_arcsine_pdf_nvrtc_float.cpp +++ b/test/test_arcsine_pdf_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_arcsine_quantile_nvrtc_double.cpp b/test/test_arcsine_quantile_nvrtc_double.cpp index 8a2d39ed5..ba8e2e5df 100644 --- a/test/test_arcsine_quantile_nvrtc_double.cpp +++ b/test/test_arcsine_quantile_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_arcsine_quantile_nvrtc_float.cpp b/test/test_arcsine_quantile_nvrtc_float.cpp index 7491b7e11..1fd2e4884 100644 --- a/test/test_arcsine_quantile_nvrtc_float.cpp +++ b/test/test_arcsine_quantile_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_cdf_nvrtc_double.cpp b/test/test_bernoulli_cdf_nvrtc_double.cpp index fc21b9ef6..887c3430a 100644 --- a/test/test_bernoulli_cdf_nvrtc_double.cpp +++ b/test/test_bernoulli_cdf_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_cdf_nvrtc_float.cpp b/test/test_bernoulli_cdf_nvrtc_float.cpp index d5112b6be..1a38c4348 100644 --- a/test/test_bernoulli_cdf_nvrtc_float.cpp +++ b/test/test_bernoulli_cdf_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_pdf_nvrtc_double.cpp b/test/test_bernoulli_pdf_nvrtc_double.cpp index f1b4bdf99..f3e21a095 100644 --- a/test/test_bernoulli_pdf_nvrtc_double.cpp +++ b/test/test_bernoulli_pdf_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_pdf_nvrtc_float.cpp b/test/test_bernoulli_pdf_nvrtc_float.cpp index 245945cde..bf9b76016 100644 --- a/test/test_bernoulli_pdf_nvrtc_float.cpp +++ b/test/test_bernoulli_pdf_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_quan_nvrtc_double.cpp b/test/test_bernoulli_quan_nvrtc_double.cpp index 63c8ae7fb..dcdd0e1f1 100644 --- a/test/test_bernoulli_quan_nvrtc_double.cpp +++ b/test/test_bernoulli_quan_nvrtc_double.cpp @@ -28,7 +28,7 @@ typedef double float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; diff --git a/test/test_bernoulli_quan_nvrtc_float.cpp b/test/test_bernoulli_quan_nvrtc_float.cpp index 526438867..24c492381 100644 --- a/test/test_bernoulli_quan_nvrtc_float.cpp +++ b/test/test_bernoulli_quan_nvrtc_float.cpp @@ -28,7 +28,7 @@ typedef float float_type; #include #include extern "C" __global__ -void test_digamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < numElements) @@ -83,10 +83,10 @@ int main() nvrtcProgram prog; nvrtcResult res; - res = nvrtcCreateProgram(&prog, cuda_kernel, "test_digamma_kernel.cu", 0, nullptr, nullptr); + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); checkNVRTCError(res, "Failed to create NVRTC program"); - nvrtcAddNameExpression(prog, "test_digamma_kernel"); + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); #ifdef BOOST_MATH_NVRTC_CI_RUN const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; @@ -117,7 +117,7 @@ int main() CUmodule module; CUfunction kernel; checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); - checkCUError(cuModuleGetFunction(&kernel, module, "test_digamma_kernel"), "Failed to get kernel function"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); int numElements = 5000; float_type *h_in1, *h_in2, *h_out; From 13ba4111de9fca43bee6c247f7935fc52eda0d77 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 14:39:38 -0400 Subject: [PATCH 12/35] Remove workarounds and markup functions that will never be taken anyway --- include/boost/math/tools/config.hpp | 1 + include/boost/math/tools/precision.hpp | 90 ++++++++------------------ 2 files changed, 29 insertions(+), 62 deletions(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index a5c8b3fc9..b67ce0ae9 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -795,6 +795,7 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) +#define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T) #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index 0055d9028..096a3265a 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -42,7 +42,7 @@ namespace tools // See Conceptual Requirements for Real Number Types. template -inline constexpr int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept +BOOST_MATH_GPU_ENABLED inline constexpr int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept { static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); static_assert( ::boost::math::numeric_limits::radix == 2 || ::boost::math::numeric_limits::radix == 10, "Type T must have a radix of 2 or 10"); @@ -53,7 +53,7 @@ inline constexpr int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept } template -inline constexpr T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); return (boost::math::numeric_limits::max)(); @@ -61,7 +61,7 @@ inline constexpr T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boo // -max_value = -1.79769e+308, max_value = 1.79769e+308. template -inline constexpr T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { static_assert( ::boost::math::numeric_limits::is_specialized, "Type T must be specialized"); @@ -144,30 +144,14 @@ BOOST_MATH_GPU_ENABLED inline T log_min_value(const boost::math::integral_consta } template -constexpr T epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED constexpr T epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(boost::math::is_floating_point::value) { return boost::math::numeric_limits::epsilon(); } -#ifdef BOOST_MATH_HAS_GPU_SUPPORT - -template <> -BOOST_MATH_GPU_ENABLED constexpr float epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(float)) noexcept(true) -{ - return FLT_EPSILON; -} - -template <> -BOOST_MATH_GPU_ENABLED constexpr double epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(double)) noexcept(true) -{ - return DBL_EPSILON; -} - -#endif - #if defined(__GNUC__) && ((LDBL_MANT_DIG == 106) || (__LDBL_MANT_DIG__ == 106)) template <> -inline constexpr long double epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr long double epsilon(const boost::math::true_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) noexcept(boost::math::is_floating_point::value) { // numeric_limits on Darwin (and elsewhere) tells lies here: // the issue is that long double on a few platforms is @@ -186,7 +170,7 @@ inline constexpr long double epsilon(const boost::math::true_type& #endif template -inline T epsilon(const boost::math::false_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +BOOST_MATH_GPU_ENABLED inline T epsilon(const boost::math::false_type& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { // Note: don't cache result as precision may vary at runtime: BOOST_MATH_STD_USING // for ADL of std names @@ -254,38 +238,20 @@ BOOST_MATH_GPU_ENABLED inline constexpr T log_min_value(BOOST_MATH_EXPLICIT_TEMP template BOOST_MATH_GPU_ENABLED constexpr T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) noexcept(boost::math::is_floating_point::value) { -#ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - return detail::epsilon(boost::math::integral_constant::is_specialized>()); + // NVRTC does not like this dispatching method so we just skip to where we want to go +#ifndef BOOST_MATH_HAS_NVRTC + #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS + return detail::epsilon(boost::math::integral_constant::is_specialized>()); + #else + return ::boost::math::numeric_limits::is_specialized ? + detail::epsilon(boost::math::true_type()) : + detail::epsilon(boost::math::false_type()); + #endif #else - return ::boost::math::numeric_limits::is_specialized ? - detail::epsilon(boost::math::true_type()) : - detail::epsilon(boost::math::false_type()); + return boost::math::numeric_limits::epsilon(); #endif } -#ifdef BOOST_MATH_HAS_GPU_SUPPORT - -template <> -BOOST_MATH_GPU_ENABLED constexpr float epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) noexcept(true) -{ - return FLT_EPSILON; -} - -template <> -BOOST_MATH_GPU_ENABLED constexpr double epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) noexcept(true) -{ - return DBL_EPSILON; -} - -template <> -BOOST_MATH_GPU_ENABLED constexpr long double epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(long double)) noexcept(true) -{ - return LDBL_EPSILON; -} - - -#endif - namespace detail{ template @@ -301,19 +267,19 @@ BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const boost } template -inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.32927225399135962333569506281281311031656150598474e-9L); } template -inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.1387778780781445675529539585113525390625e-16L); } template -inline T root_epsilon_imp(const T*, const Tag&) +BOOST_MATH_GPU_ENABLED inline T root_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING static const T r_eps = sqrt(tools::epsilon()); @@ -321,7 +287,7 @@ inline T root_epsilon_imp(const T*, const Tag&) } template -inline T root_epsilon_imp(const T*, const boost::math::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T root_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING return sqrt(tools::epsilon()); @@ -340,19 +306,19 @@ BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const boost } template -inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(4.76837158203125e-7L); } template -inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(5.7749313854154005630396773604745549542403508090496e-12L); } template -inline T cbrt_epsilon_imp(const T*, const Tag&) +BOOST_MATH_GPU_ENABLED inline T cbrt_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING; static const T cbrt_eps = pow(tools::epsilon(), T(1) / 3); @@ -360,7 +326,7 @@ inline T cbrt_epsilon_imp(const T*, const Tag&) } template -inline T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T cbrt_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING; return pow(tools::epsilon(), T(1) / 3); @@ -379,19 +345,19 @@ BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const } template -inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.18145860519450699870567321328132261891067079047605e-4L); } template -inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) +BOOST_MATH_GPU_ENABLED inline constexpr T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) noexcept(boost::math::is_floating_point::value) { return static_cast(0.37252902984619140625e-8L); } template -inline T forth_root_epsilon_imp(const T*, const Tag&) +BOOST_MATH_GPU_ENABLED inline T forth_root_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING static const T r_eps = sqrt(sqrt(tools::epsilon())); @@ -399,7 +365,7 @@ inline T forth_root_epsilon_imp(const T*, const Tag&) } template -inline T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) +BOOST_MATH_GPU_ENABLED inline T forth_root_epsilon_imp(const T*, const boost::math::integral_constant&) { BOOST_MATH_STD_USING return sqrt(sqrt(tools::epsilon())); From 0805e5e14dc386a3b4ce6bc3a74a4f4a210ca14b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 14:39:59 -0400 Subject: [PATCH 13/35] Add NVRTC support to cauchy dist --- include/boost/math/distributions/cauchy.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/cauchy.hpp b/include/boost/math/distributions/cauchy.hpp index 91c848c91..0992b389e 100644 --- a/include/boost/math/distributions/cauchy.hpp +++ b/include/boost/math/distributions/cauchy.hpp @@ -17,12 +17,18 @@ #include #include #include -#include +#include #include #include #include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include #include #include +#endif namespace boost{ namespace math { From 5171e837be11b7ae2cd5e0a5b80ed642aeed35d0 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 14:40:09 -0400 Subject: [PATCH 14/35] Add cauchy dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_cauchy_cdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_cauchy_cdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ test/test_cauchy_pdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_cauchy_pdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ test/test_cauchy_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_cauchy_quan_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_cauchy_cdf_nvrtc_double.cpp create mode 100644 test/test_cauchy_cdf_nvrtc_float.cpp create mode 100644 test/test_cauchy_pdf_nvrtc_double.cpp create mode 100644 test/test_cauchy_pdf_nvrtc_float.cpp create mode 100644 test/test_cauchy_quan_nvrtc_double.cpp create mode 100644 test/test_cauchy_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 10e7077d1..2825fb969 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ; run test_bernoulli_quan_nvrtc_double.cpp ; run test_bernoulli_quan_nvrtc_float.cpp ; +run test_cauchy_cdf_nvrtc_double.cpp ; +run test_cauchy_cdf_nvrtc_float.cpp ; +run test_cauchy_pdf_nvrtc_double.cpp ; +run test_cauchy_pdf_nvrtc_float.cpp ; +run test_cauchy_quan_nvrtc_double.cpp ; +run test_cauchy_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_cauchy_cdf_nvrtc_double.cpp b/test/test_cauchy_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f9b16b6e0 --- /dev/null +++ b/test/test_cauchy_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cauchy_cdf_nvrtc_float.cpp b/test/test_cauchy_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..0870900ae --- /dev/null +++ b/test/test_cauchy_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cauchy_pdf_nvrtc_double.cpp b/test/test_cauchy_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..24e2808c9 --- /dev/null +++ b/test/test_cauchy_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cauchy_pdf_nvrtc_float.cpp b/test/test_cauchy_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..605d25783 --- /dev/null +++ b/test/test_cauchy_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cauchy_quan_nvrtc_double.cpp b/test/test_cauchy_quan_nvrtc_double.cpp new file mode 100644 index 000000000..08fa01c4f --- /dev/null +++ b/test/test_cauchy_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cauchy_quan_nvrtc_float.cpp b/test/test_cauchy_quan_nvrtc_float.cpp new file mode 100644 index 000000000..6d63d1c2d --- /dev/null +++ b/test/test_cauchy_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cauchy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::cauchy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cauchy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cauchy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cauchy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::cauchy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 9b3c13985dd656b33ab38001c6a22e1274913ba3 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 15:54:15 -0400 Subject: [PATCH 15/35] Add NVRTC support to exponential dist --- include/boost/math/distributions/exponential.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/exponential.hpp b/include/boost/math/distributions/exponential.hpp index 01c442fa8..9d45ac493 100644 --- a/include/boost/math/distributions/exponential.hpp +++ b/include/boost/math/distributions/exponential.hpp @@ -10,12 +10,13 @@ #include #include #include -#include #include #include #include #include #include +#include +#include #ifdef _MSC_VER # pragma warning(push) @@ -23,8 +24,11 @@ # pragma warning(disable: 4702) // unreachable code (return after domain_error throw). #endif +#ifndef BOOST_MATH_HAS_NVRTC +#include #include #include +#endif namespace boost{ namespace math{ From 363f2df6e08e222db7bd07e5c0b3e3bb6eb69056 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 15:54:28 -0400 Subject: [PATCH 16/35] Add exponential dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_exponential_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_exponential_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_exponential_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_exponential_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++ test/test_exponential_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++ test/test_exponential_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_exponential_cdf_nvrtc_double.cpp create mode 100644 test/test_exponential_cdf_nvrtc_float.cpp create mode 100644 test/test_exponential_pdf_nvrtc_double.cpp create mode 100644 test/test_exponential_pdf_nvrtc_float.cpp create mode 100644 test/test_exponential_quan_nvrtc_double.cpp create mode 100644 test/test_exponential_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 2825fb969..da734ce07 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -31,6 +31,13 @@ run test_cauchy_pdf_nvrtc_float.cpp ; run test_cauchy_quan_nvrtc_double.cpp ; run test_cauchy_quan_nvrtc_float.cpp ; +run test_exponential_cdf_nvrtc_double.cpp ; +run test_exponential_cdf_nvrtc_float.cpp ; +run test_exponential_pdf_nvrtc_double.cpp ; +run test_exponential_pdf_nvrtc_float.cpp ; +run test_exponential_quan_nvrtc_double.cpp ; +run test_exponential_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_exponential_cdf_nvrtc_double.cpp b/test/test_exponential_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..42849f8ab --- /dev/null +++ b/test/test_exponential_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_exponential_cdf_nvrtc_float.cpp b/test/test_exponential_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..9417abf02 --- /dev/null +++ b/test/test_exponential_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_exponential_pdf_nvrtc_double.cpp b/test/test_exponential_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..2c52a4b38 --- /dev/null +++ b/test/test_exponential_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_exponential_pdf_nvrtc_float.cpp b/test/test_exponential_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..ef1aff3f3 --- /dev/null +++ b/test/test_exponential_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_exponential_quan_nvrtc_double.cpp b/test/test_exponential_quan_nvrtc_double.cpp new file mode 100644 index 000000000..b05f77ffe --- /dev/null +++ b/test/test_exponential_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_exponential_quan_nvrtc_float.cpp b/test/test_exponential_quan_nvrtc_float.cpp new file mode 100644 index 000000000..fb932c5d7 --- /dev/null +++ b/test/test_exponential_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_exponential_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::exponential_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_exponential_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_exponential_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_exponential_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::exponential_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From f650f77527dd01d2f5108d76aaaab6c88da88050 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 16:15:26 -0400 Subject: [PATCH 17/35] Add NVRTC support to extreme value dist --- include/boost/math/distributions/extreme_value.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/boost/math/distributions/extreme_value.hpp b/include/boost/math/distributions/extreme_value.hpp index 0d638092b..73454d29d 100644 --- a/include/boost/math/distributions/extreme_value.hpp +++ b/include/boost/math/distributions/extreme_value.hpp @@ -10,12 +10,14 @@ #include #include #include -#include +#include #include #include #include #include #include +#include +#include // // This is the maximum extreme value distribution, see @@ -24,8 +26,11 @@ // Also known as a Fisher-Tippett distribution, a log-Weibull // distribution or a Gumbel distribution. +#ifndef BOOST_MATH_HAS_NVRTC +#include #include #include +#endif #ifdef _MSC_VER # pragma warning(push) @@ -280,7 +285,7 @@ BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type -RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions From e508d498ca7447c4a5f713558363e164d9c1c4de Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 16:15:48 -0400 Subject: [PATCH 18/35] Add NVRTC paths to log_max_value and log_min_value --- include/boost/math/tools/precision.hpp | 32 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index 096a3265a..1e4bf58ac 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -208,26 +208,34 @@ struct log_limit_noexcept_traits : public log_limit_noexcept_traits_imp BOOST_MATH_GPU_ENABLED inline constexpr T log_max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(detail::log_limit_noexcept_traits::value) { -#ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - return detail::log_max_value(typename detail::log_limit_traits::tag_type()); +#ifndef BOOST_MATH_HAS_NVRTC + #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS + return detail::log_max_value(typename detail::log_limit_traits::tag_type()); + #else + BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); + BOOST_MATH_STD_USING + static const T val = log((boost::math::numeric_limits::max)()); + return val; + #endif #else - BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); - BOOST_MATH_STD_USING - static const T val = log((boost::math::numeric_limits::max)()); - return val; + return log((boost::math::numeric_limits::max)()); #endif } template BOOST_MATH_GPU_ENABLED inline constexpr T log_min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) noexcept(detail::log_limit_noexcept_traits::value) { -#ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - return detail::log_min_value(typename detail::log_limit_traits::tag_type()); +#ifndef BOOST_MATH_HAS_NVRTC + #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS + return detail::log_min_value(typename detail::log_limit_traits::tag_type()); + #else + BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); + BOOST_MATH_STD_USING + static const T val = log((boost::math::numeric_limits::min)()); + return val; + #endif #else - BOOST_MATH_ASSERT(::boost::math::numeric_limits::is_specialized); - BOOST_MATH_STD_USING - static const T val = log((boost::math::numeric_limits::min)()); - return val; + return log((boost::math::numeric_limits::min)()); #endif } From 4217281be5e34a76626133a247ae4196f238173a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 16:16:03 -0400 Subject: [PATCH 19/35] Add extreme value dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_extreme_value_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_extreme_value_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_extreme_value_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_extreme_value_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++ test/test_extreme_value_quan_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_extreme_value_quan_nvrtc_float.cpp | 191 ++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_extreme_value_cdf_nvrtc_double.cpp create mode 100644 test/test_extreme_value_cdf_nvrtc_float.cpp create mode 100644 test/test_extreme_value_pdf_nvrtc_double.cpp create mode 100644 test/test_extreme_value_pdf_nvrtc_float.cpp create mode 100644 test/test_extreme_value_quan_nvrtc_double.cpp create mode 100644 test/test_extreme_value_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index da734ce07..9620115d8 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -38,6 +38,13 @@ run test_exponential_pdf_nvrtc_float.cpp ; run test_exponential_quan_nvrtc_double.cpp ; run test_exponential_quan_nvrtc_float.cpp ; +run test_extreme_value_cdf_nvrtc_double.cpp ; +run test_extreme_value_cdf_nvrtc_float.cpp ; +run test_extreme_value_pdf_nvrtc_double.cpp ; +run test_extreme_value_pdf_nvrtc_float.cpp ; +run test_extreme_value_quan_nvrtc_double.cpp ; +run test_extreme_value_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_extreme_value_cdf_nvrtc_double.cpp b/test/test_extreme_value_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..b3600b21b --- /dev/null +++ b/test/test_extreme_value_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_extreme_value_cdf_nvrtc_float.cpp b/test/test_extreme_value_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..b3600b21b --- /dev/null +++ b/test/test_extreme_value_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_extreme_value_pdf_nvrtc_double.cpp b/test/test_extreme_value_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..049f45d89 --- /dev/null +++ b/test/test_extreme_value_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_extreme_value_pdf_nvrtc_float.cpp b/test/test_extreme_value_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..adbd26348 --- /dev/null +++ b/test/test_extreme_value_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_extreme_value_quan_nvrtc_double.cpp b/test/test_extreme_value_quan_nvrtc_double.cpp new file mode 100644 index 000000000..239df9994 --- /dev/null +++ b/test/test_extreme_value_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_extreme_value_quan_nvrtc_float.cpp b/test/test_extreme_value_quan_nvrtc_float.cpp new file mode 100644 index 000000000..fc9d8c5f4 --- /dev/null +++ b/test/test_extreme_value_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_extreme_value_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::extreme_value_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_extreme_value_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_extreme_value_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_extreme_value_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::extreme_value_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 5adc939da7cae0133355aada7f6c0a3172b54545 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 17:03:02 -0400 Subject: [PATCH 20/35] Replace thurst::pair with cuda::std::pair for NVRTC compat --- include/boost/math/tools/config.hpp | 1 + include/boost/math/tools/tuple.hpp | 44 ++++++----------------------- 2 files changed, 9 insertions(+), 36 deletions(-) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index b67ce0ae9..3fa05b351 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -674,6 +674,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index fba37fb4f..26f81042f 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -9,50 +9,22 @@ #include -#ifdef BOOST_MATH_HAS_NVRTC +#ifdef BOOST_MATH_ENABLE_CUDA -namespace boost { -namespace math { - -template -class pair -{ -public: - T first; - U second; - - BOOST_MATH_GPU_ENABLED pair(T x, U y) noexcept : first {x}, second {y} {} -}; - -template -BOOST_MATH_GPU_ENABLED auto make_pair(T x, U y) noexcept -{ - return pair(x, y); -} - -} // namespace math -} // namespace boost - - -#elif defined(BOOST_MATH_ENABLE_CUDA) - -#include -#include +#include namespace boost { namespace math { -using thrust::pair; -using thrust::tuple; +using cuda::std::pair; +using cuda::std::tuple; -using thrust::make_pair; -using thrust::make_tuple; +using cuda::std::make_pair; -using thrust::tie; -using thrust::get; +using cuda::std::get; -using thrust::tuple_size; -using thrust::tuple_element; +using cuda::std::tuple_size; +using cuda::std::tuple_element; } // namespace math } // namespace boost From 4dc7070722aa0d7cf869ae958a0a9a69ad05c716 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 14 Aug 2024 17:13:29 -0400 Subject: [PATCH 21/35] Implement make_tuple since it's missing from cuda/std --- include/boost/math/tools/tuple.hpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index 26f81042f..6a949d5e5 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -11,6 +11,7 @@ #ifdef BOOST_MATH_ENABLE_CUDA +#include #include namespace boost { @@ -26,6 +27,31 @@ using cuda::std::get; using cuda::std::tuple_size; using cuda::std::tuple_element; +namespace detail { + +template +BOOST_MATH_GPU_ENABLED T&& forward(boost::math::remove_reference_t& arg) noexcept +{ + return static_cast(arg); +} + +template +BOOST_MATH_GPU_ENABLED T&& forward(boost::math::remove_reference_t&& arg) noexcept +{ + static_assert(!boost::math::is_lvalue_reference::value, "Cannot forward an rvalue as an lvalue."); + return static_cast(arg); +} + +} // namespace detail + +template +BOOST_MATH_GPU_ENABLED auto make_tuple(T&& t, Ts&&... ts) +{ + return cuda::std::tuple, boost::math::decay_t...>( + boost::math::detail::forward(t), boost::math::detail::forward(ts)... + ); +} + } // namespace math } // namespace boost From 1a8bfbb713d44a7fd896c61e3d745fe8eb55b22e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 09:42:17 -0400 Subject: [PATCH 22/35] Add more macro definitions --- include/boost/math/tools/config.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 3fa05b351..364aa8f54 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -793,6 +793,7 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_GPU_ENABLED __host__ __device__ #define BOOST_MATH_STATIC static +#define BOOST_MATH_STATIC_LOCAL_VARIABLE #define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) @@ -800,6 +801,7 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING +#define BOOST_MATH_IF_CONSTEXPR if constexpr // This should be defined to nothing but since it is not specifically a math macro // we need to undef before proceeding From 4518351ff9b3dfd0417f6f6ca761cb37ef9f8da4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 09:42:34 -0400 Subject: [PATCH 23/35] Add NVRTC support to holtsmark dist --- .../boost/math/distributions/holtsmark.hpp | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/include/boost/math/distributions/holtsmark.hpp b/include/boost/math/distributions/holtsmark.hpp index 8c976b8c8..04f5484f4 100644 --- a/include/boost/math/distributions/holtsmark.hpp +++ b/include/boost/math/distributions/holtsmark.hpp @@ -15,16 +15,23 @@ #include #include #include -#include +#include #include -#include #include #include #include #include #include +#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#include #include #include +#endif namespace boost { namespace math { template @@ -33,7 +40,7 @@ class holtsmark_distribution; namespace detail { template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -263,7 +270,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealTyp template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -615,14 +622,14 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_plus_imp_prec(const RealTyp template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_imp_prec(const RealType& x, const std::integral_constant &tag) { +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_imp_prec(const RealType& x, const boost::math::integral_constant &tag) { BOOST_MATH_STD_USING // for ADL of std functions return holtsmark_pdf_plus_imp_prec(abs(x), tag); } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_imp_prec(const RealType& x, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_imp_prec(const RealType& x, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING // for ADL of std functions return holtsmark_pdf_plus_imp_prec(abs(x), tag); @@ -655,7 +662,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_pdf_imp(const holtsmark_distrib typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -920,7 +927,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealTyp } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1303,7 +1310,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_plus_imp_prec(const RealTyp } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? holtsmark_cdf_plus_imp_prec(x, tag) : 1 - holtsmark_cdf_plus_imp_prec(x, tag); } @@ -1316,7 +1323,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp_prec(const RealType& x, } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? holtsmark_cdf_plus_imp_prec(x, tag) : 1 - holtsmark_cdf_plus_imp_prec(x, tag); } @@ -1355,7 +1362,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_cdf_imp(const holtsmark_distrib typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1634,7 +1641,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const R template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2230,7 +2237,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_upper_imp_prec(const R } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? holtsmark_quantile_upper_imp_prec(1 - p, tag) : -holtsmark_quantile_upper_imp_prec(1 - p, tag); @@ -2240,7 +2247,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp_prec(const RealTyp } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? holtsmark_quantile_upper_imp_prec(1 - p, tag) : -holtsmark_quantile_upper_imp_prec(1 - p, tag); @@ -2277,7 +2284,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_quantile_imp(const holtsmark_di typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_entropy_imp_prec(const boost::math::integral_constant&) { return static_cast(2.06944850513462440032); } template -BOOST_MATH_GPU_ENABLED inline RealType holtsmark_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType holtsmark_entropy_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, 2.0694485051346244003155800384542166381); } @@ -2320,7 +2327,7 @@ BOOST_MATH_GPU_ENABLED inline RealType holtsmark_entropy_imp(const holtsmark_dis typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant Date: Thu, 15 Aug 2024 09:42:48 -0400 Subject: [PATCH 24/35] Add holtsmark dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_holtsmark_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_holtsmark_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_holtsmark_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_holtsmark_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_holtsmark_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_holtsmark_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_holtsmark_cdf_nvrtc_double.cpp create mode 100644 test/test_holtsmark_cdf_nvrtc_float.cpp create mode 100644 test/test_holtsmark_pdf_nvrtc_double.cpp create mode 100644 test/test_holtsmark_pdf_nvrtc_float.cpp create mode 100644 test/test_holtsmark_quan_nvrtc_double.cpp create mode 100644 test/test_holtsmark_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 9620115d8..93e6deddd 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -45,6 +45,13 @@ run test_extreme_value_pdf_nvrtc_float.cpp ; run test_extreme_value_quan_nvrtc_double.cpp ; run test_extreme_value_quan_nvrtc_float.cpp ; +run test_holtsmark_cdf_nvrtc_double.cpp ; +run test_holtsmark_cdf_nvrtc_float.cpp ; +run test_holtsmark_pdf_nvrtc_double.cpp ; +run test_holtsmark_pdf_nvrtc_float.cpp ; +run test_holtsmark_quan_nvrtc_double.cpp ; +run test_holtsmark_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_holtsmark_cdf_nvrtc_double.cpp b/test/test_holtsmark_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..3b8c9ba94 --- /dev/null +++ b/test/test_holtsmark_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_holtsmark_cdf_nvrtc_float.cpp b/test/test_holtsmark_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a3ffec594 --- /dev/null +++ b/test/test_holtsmark_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_holtsmark_pdf_nvrtc_double.cpp b/test/test_holtsmark_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..a8355d368 --- /dev/null +++ b/test/test_holtsmark_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_holtsmark_pdf_nvrtc_float.cpp b/test/test_holtsmark_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..7a02d1983 --- /dev/null +++ b/test/test_holtsmark_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_holtsmark_quan_nvrtc_double.cpp b/test/test_holtsmark_quan_nvrtc_double.cpp new file mode 100644 index 000000000..1c2cc61fa --- /dev/null +++ b/test/test_holtsmark_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_holtsmark_quan_nvrtc_float.cpp b/test/test_holtsmark_quan_nvrtc_float.cpp new file mode 100644 index 000000000..a343f232e --- /dev/null +++ b/test/test_holtsmark_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_holtsmark_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::holtsmark_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_holtsmark_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_holtsmark_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_holtsmark_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::holtsmark_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 6ba8cbbbee7c281536219b3534758d92684a472a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 09:59:34 -0400 Subject: [PATCH 25/35] Add NVRTC support to landau dist --- include/boost/math/distributions/landau.hpp | 74 ++++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/include/boost/math/distributions/landau.hpp b/include/boost/math/distributions/landau.hpp index 424931c27..129eca287 100644 --- a/include/boost/math/distributions/landau.hpp +++ b/include/boost/math/distributions/landau.hpp @@ -15,15 +15,22 @@ #include #include #include -#include +#include #include -#include #include #include #include #include +#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#include #include #include +#endif namespace boost { namespace math { template @@ -32,7 +39,7 @@ class landau_distribution; namespace detail { template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -345,7 +352,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -873,7 +880,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_plus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1013,7 +1020,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1224,7 +1231,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_minus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp_prec(const RealType& x, const std::integral_constant &tag) { +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp_prec(const RealType& x, const boost::math::integral_constant &tag) { if (x >= 0) { return landau_pdf_plus_imp_prec(x, tag); } @@ -1237,7 +1244,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp_prec(const RealType& x, co } template -BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp_prec(const RealType& x, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp_prec(const RealType& x, const boost::math::integral_constant& tag) { if (x >= 0) { return landau_pdf_plus_imp_prec(x, tag); } @@ -1277,7 +1284,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_pdf_imp(const landau_distribution< typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1600,7 +1607,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& template -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2123,7 +2130,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_plus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2266,7 +2273,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2482,7 +2489,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_minus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? landau_cdf_plus_imp_prec(x, tag) : 1 - landau_cdf_plus_imp_prec(x, tag); } @@ -2495,7 +2502,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp_prec(const RealType& x, bo } template -BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? landau_cdf_plus_imp_prec(x, tag) : 1 - landau_cdf_plus_imp_prec(x, tag); } @@ -2535,7 +2542,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_cdf_imp(const landau_distribution< typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2882,7 +2889,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const Real template -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -3542,7 +3549,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_lower_imp_prec(const Real } template -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -3782,7 +3789,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const Real template -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -4229,7 +4236,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_upper_imp_prec(const Real } template -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { @@ -4240,7 +4247,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { @@ -4279,7 +4286,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_quantile_imp(const landau_distribu typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType landau_mode_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_mode_imp_prec(const boost::math::integral_constant&) { return static_cast(-0.42931452986133525017); } template -BOOST_MATH_GPU_ENABLED inline RealType landau_mode_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_mode_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, -0.42931452986133525016556463510885028346); } @@ -4328,7 +4335,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_mode_imp(const landau_distribution typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType landau_median_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_median_imp_prec(const boost::math::integral_constant&) { return static_cast(0.57563014394507821440); } template -BOOST_MATH_GPU_ENABLED inline RealType landau_median_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_median_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, 0.57563014394507821439627930892257517269); } @@ -4377,7 +4384,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_median_imp(const landau_distributi typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType landau_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_entropy_imp_prec(const boost::math::integral_constant&) { return static_cast(2.37263644000448182448); } template -BOOST_MATH_GPU_ENABLED inline RealType landau_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType landau_entropy_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, 2.3726364400044818244844049010588577710); } @@ -4420,7 +4427,7 @@ BOOST_MATH_GPU_ENABLED inline RealType landau_entropy_imp(const landau_distribut typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant() * log(l_scale); } // landau_distribution From 3eb59a6c228d7090d2b92d13ca1d1366a6899d60 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 09:59:46 -0400 Subject: [PATCH 26/35] Add landau dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_landau_cdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_landau_cdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ test/test_landau_pdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_landau_pdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ test/test_landau_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++++++ test/test_landau_quan_nvrtc_float.cpp | 191 +++++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_landau_cdf_nvrtc_double.cpp create mode 100644 test/test_landau_cdf_nvrtc_float.cpp create mode 100644 test/test_landau_pdf_nvrtc_double.cpp create mode 100644 test/test_landau_pdf_nvrtc_float.cpp create mode 100644 test/test_landau_quan_nvrtc_double.cpp create mode 100644 test/test_landau_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 93e6deddd..0ed1c9de6 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -52,6 +52,13 @@ run test_holtsmark_pdf_nvrtc_float.cpp ; run test_holtsmark_quan_nvrtc_double.cpp ; run test_holtsmark_quan_nvrtc_float.cpp ; +run test_landau_cdf_nvrtc_double.cpp ; +run test_landau_cdf_nvrtc_float.cpp ; +run test_landau_pdf_nvrtc_double.cpp ; +run test_landau_pdf_nvrtc_float.cpp ; +run test_landau_quan_nvrtc_double.cpp ; +run test_landau_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_landau_cdf_nvrtc_double.cpp b/test/test_landau_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f23a758e1 --- /dev/null +++ b/test/test_landau_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_landau_cdf_nvrtc_float.cpp b/test/test_landau_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..1a339724e --- /dev/null +++ b/test/test_landau_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_landau_pdf_nvrtc_double.cpp b/test/test_landau_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..1e8df7d07 --- /dev/null +++ b/test/test_landau_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_landau_pdf_nvrtc_float.cpp b/test/test_landau_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..1e8df7d07 --- /dev/null +++ b/test/test_landau_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_landau_quan_nvrtc_double.cpp b/test/test_landau_quan_nvrtc_double.cpp new file mode 100644 index 000000000..f4a5b9553 --- /dev/null +++ b/test/test_landau_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_landau_quan_nvrtc_float.cpp b/test/test_landau_quan_nvrtc_float.cpp new file mode 100644 index 000000000..4a10b497c --- /dev/null +++ b/test/test_landau_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_landau_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::landau_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_landau_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_landau_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_landau_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::landau_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4a10ab55d71e556a055ac055f1f3542308b6e2d9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:15:21 -0400 Subject: [PATCH 27/35] Add NVRTC support to laplace dist --- include/boost/math/distributions/laplace.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/laplace.hpp b/include/boost/math/distributions/laplace.hpp index 55e2236f9..912a15712 100644 --- a/include/boost/math/distributions/laplace.hpp +++ b/include/boost/math/distributions/laplace.hpp @@ -25,7 +25,8 @@ #include #include #include -#include +#include +#include namespace boost{ namespace math{ From 6ef1632f18c4f5b0f158c7e3116628ba2d2b792d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:15:34 -0400 Subject: [PATCH 28/35] Add laplace dist NVRTC testing --- test/nvrtc_jamfile | 8 + test/test_laplace_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_laplace_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_laplace_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_laplace_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_laplace_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_laplace_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ 7 files changed, 1154 insertions(+) create mode 100644 test/test_laplace_cdf_nvrtc_double.cpp create mode 100644 test/test_laplace_cdf_nvrtc_float.cpp create mode 100644 test/test_laplace_pdf_nvrtc_double.cpp create mode 100644 test/test_laplace_pdf_nvrtc_float.cpp create mode 100644 test/test_laplace_quan_nvrtc_double.cpp create mode 100644 test/test_laplace_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 0ed1c9de6..3a9ecb739 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -59,6 +59,14 @@ run test_landau_pdf_nvrtc_float.cpp ; run test_landau_quan_nvrtc_double.cpp ; run test_landau_quan_nvrtc_float.cpp ; +run test_laplace_cdf_nvrtc_double.cpp ; +run test_laplace_cdf_nvrtc_float.cpp ; +run test_laplace_pdf_nvrtc_double.cpp ; +run test_laplace_pdf_nvrtc_float.cpp ; +run test_laplace_quan_nvrtc_double.cpp ; +run test_laplace_quan_nvrtc_float.cpp ; + + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_laplace_cdf_nvrtc_double.cpp b/test/test_laplace_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..9d35b5862 --- /dev/null +++ b/test/test_laplace_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_laplace_cdf_nvrtc_float.cpp b/test/test_laplace_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..aacf59cb0 --- /dev/null +++ b/test/test_laplace_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_laplace_pdf_nvrtc_double.cpp b/test/test_laplace_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..49c386415 --- /dev/null +++ b/test/test_laplace_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_laplace_pdf_nvrtc_float.cpp b/test/test_laplace_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..2100e2cdc --- /dev/null +++ b/test/test_laplace_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_laplace_quan_nvrtc_double.cpp b/test/test_laplace_quan_nvrtc_double.cpp new file mode 100644 index 000000000..cf02db5a5 --- /dev/null +++ b/test/test_laplace_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_laplace_quan_nvrtc_float.cpp b/test/test_laplace_quan_nvrtc_float.cpp new file mode 100644 index 000000000..36472aaa2 --- /dev/null +++ b/test/test_laplace_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_laplace_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::laplace_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_laplace_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_laplace_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_laplace_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::laplace_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 3b9863d99e082bb77ed54ae5ce043738d71bdc2a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:22:06 -0400 Subject: [PATCH 29/35] Add NVRTC support to logistic dist --- include/boost/math/distributions/logistic.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/boost/math/distributions/logistic.hpp b/include/boost/math/distributions/logistic.hpp index 4f94bd8a4..56dc6e9f2 100644 --- a/include/boost/math/distributions/logistic.hpp +++ b/include/boost/math/distributions/logistic.hpp @@ -11,12 +11,14 @@ #include #include #include +#include #include #include #include #include #include -#include +#include +#include namespace boost { namespace math { From e165bfafa42fc05fefd26c5a0d8971c1752eb48f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:22:19 -0400 Subject: [PATCH 30/35] Add logistic dist NVRTC testing --- test/nvrtc_jamfile | 6 + test/test_logistic_cdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_logistic_cdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_logistic_pdf_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_logistic_pdf_nvrtc_float.cpp | 191 +++++++++++++++++++++++ test/test_logistic_quan_nvrtc_double.cpp | 191 +++++++++++++++++++++++ test/test_logistic_quan_nvrtc_float.cpp | 191 +++++++++++++++++++++++ 7 files changed, 1152 insertions(+) create mode 100644 test/test_logistic_cdf_nvrtc_double.cpp create mode 100644 test/test_logistic_cdf_nvrtc_float.cpp create mode 100644 test/test_logistic_pdf_nvrtc_double.cpp create mode 100644 test/test_logistic_pdf_nvrtc_float.cpp create mode 100644 test/test_logistic_quan_nvrtc_double.cpp create mode 100644 test/test_logistic_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 3a9ecb739..c0af6e7fa 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -66,6 +66,12 @@ run test_laplace_pdf_nvrtc_float.cpp ; run test_laplace_quan_nvrtc_double.cpp ; run test_laplace_quan_nvrtc_float.cpp ; +run test_logistic_cdf_nvrtc_double.cpp ; +run test_logistic_cdf_nvrtc_float.cpp ; +run test_logistic_pdf_nvrtc_double.cpp ; +run test_logistic_pdf_nvrtc_float.cpp ; +run test_logistic_quan_nvrtc_double.cpp ; +run test_logistic_quan_nvrtc_float.cpp ; # Special Functions run test_cbrt_nvrtc_double.cpp ; diff --git a/test/test_logistic_cdf_nvrtc_double.cpp b/test/test_logistic_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..5548dc412 --- /dev/null +++ b/test/test_logistic_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_logistic_cdf_nvrtc_float.cpp b/test/test_logistic_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..ea65cc97c --- /dev/null +++ b/test/test_logistic_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_logistic_pdf_nvrtc_double.cpp b/test/test_logistic_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..d287225cf --- /dev/null +++ b/test/test_logistic_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_logistic_pdf_nvrtc_float.cpp b/test/test_logistic_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..9339a6db3 --- /dev/null +++ b/test/test_logistic_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_logistic_quan_nvrtc_double.cpp b/test/test_logistic_quan_nvrtc_double.cpp new file mode 100644 index 000000000..f763db82c --- /dev/null +++ b/test/test_logistic_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_logistic_quan_nvrtc_float.cpp b/test/test_logistic_quan_nvrtc_float.cpp new file mode 100644 index 000000000..b14c3c5da --- /dev/null +++ b/test/test_logistic_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_logistic_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::logistic_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_logistic_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_logistic_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_logistic_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::logistic_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 4f2eb0218f259c59fb15b3fb6f9d5e3e7911f8bd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:40:15 -0400 Subject: [PATCH 31/35] Disable whole file on NVRTC --- include/boost/math/tools/big_constant.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/boost/math/tools/big_constant.hpp b/include/boost/math/tools/big_constant.hpp index 2fdfec782..0d54976bc 100644 --- a/include/boost/math/tools/big_constant.hpp +++ b/include/boost/math/tools/big_constant.hpp @@ -8,6 +8,12 @@ #define BOOST_MATH_TOOLS_BIG_CONSTANT_HPP #include + +// On NVRTC we don't need any of this +// We just have a simple definition of the macro since the largest float +// type on the platform is a 64-bit double +#ifndef BOOST_MATH_HAS_NVRTC + #ifndef BOOST_MATH_STANDALONE #include #endif @@ -94,5 +100,7 @@ inline constexpr T make_big_value(largest_float, const char* s, std::false_type }}} // namespaces +#endif // BOOST_MATH_HAS_NVRTC + #endif From 20e3a7ea87c7895243c8d7aec0876f78b93d7149 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:40:28 -0400 Subject: [PATCH 32/35] Add NVRTC support to mapairy dist --- include/boost/math/distributions/mapairy.hpp | 69 +++++++++++--------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/include/boost/math/distributions/mapairy.hpp b/include/boost/math/distributions/mapairy.hpp index 542cf6da3..8bf1f990c 100644 --- a/include/boost/math/distributions/mapairy.hpp +++ b/include/boost/math/distributions/mapairy.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -23,8 +22,14 @@ #include #include #include -#include +#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include #include +#endif namespace boost { namespace math { template @@ -33,7 +38,7 @@ class mapairy_distribution; namespace detail { template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -255,7 +260,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -584,7 +589,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_plus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -757,7 +762,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1048,7 +1053,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_minus_imp_prec(const RealType } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp_prec(const RealType& x, const std::integral_constant &tag) { +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp_prec(const RealType& x, const boost::math::integral_constant &tag) { if (x >= 0) { return mapairy_pdf_plus_imp_prec(x, tag); } @@ -1061,7 +1066,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp_prec(const RealType& x, c } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp_prec(const RealType& x, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp_prec(const RealType& x, const boost::math::integral_constant& tag) { if (x >= 0) { return mapairy_pdf_plus_imp_prec(x, tag); } @@ -1100,7 +1105,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_pdf_imp(const mapairy_distributio typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1331,7 +1336,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1656,7 +1661,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_plus_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1833,7 +1838,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2133,7 +2138,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_minus_imp_prec(const RealType } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? mapairy_cdf_plus_imp_prec(x, tag) : 1 - mapairy_cdf_plus_imp_prec(x, tag); } @@ -2146,7 +2151,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp_prec(const RealType& x, b } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? mapairy_cdf_plus_imp_prec(x, tag) : 1 - mapairy_cdf_plus_imp_prec(x, tag); } @@ -2185,7 +2190,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_cdf_imp(const mapairy_distributio typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2534,7 +2539,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const Rea template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -3236,7 +3241,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_lower_imp_prec(const Rea } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -3440,7 +3445,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const Rea template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -3838,7 +3843,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_upper_imp_prec(const Rea } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? mapairy_quantile_upper_imp_prec(1 - p, tag) : mapairy_quantile_lower_imp_prec(1 - p, tag); @@ -3848,7 +3853,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp_prec(const RealType& } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? mapairy_quantile_upper_imp_prec(1 - p, tag) : mapairy_quantile_lower_imp_prec(1 - p, tag); @@ -3885,7 +3890,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_quantile_imp(const mapairy_distri typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType mapairy_mode_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_mode_imp_prec(const boost::math::integral_constant&) { return static_cast(-1.16158727113597068525); } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_mode_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_mode_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, -1.1615872711359706852500000803029112987); } @@ -3933,7 +3938,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_mode_imp(const mapairy_distributi typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType mapairy_median_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_median_imp_prec(const boost::math::integral_constant&) { return static_cast(-0.71671068545502205332); } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_median_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_median_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, -0.71671068545502205331700196278067230944440); } @@ -3981,7 +3986,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_median_imp(const mapairy_distribu typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType mapairy_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_entropy_imp_prec(const boost::math::integral_constant&) { return static_cast(2.00727681841065634600); } template -BOOST_MATH_GPU_ENABLED inline RealType mapairy_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType mapairy_entropy_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, 2.0072768184106563460003025875575283708); } @@ -4024,7 +4029,7 @@ BOOST_MATH_GPU_ENABLED inline RealType mapairy_entropy_imp(const mapairy_distrib typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant Date: Thu, 15 Aug 2024 10:40:47 -0400 Subject: [PATCH 33/35] Add mapairy dist NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_mapairy_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_mapairy_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_mapairy_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_mapairy_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ test/test_mapairy_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++++ test/test_mapairy_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_mapairy_cdf_nvrtc_double.cpp create mode 100644 test/test_mapairy_cdf_nvrtc_float.cpp create mode 100644 test/test_mapairy_pdf_nvrtc_double.cpp create mode 100644 test/test_mapairy_pdf_nvrtc_float.cpp create mode 100644 test/test_mapairy_quan_nvrtc_double.cpp create mode 100644 test/test_mapairy_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index c0af6e7fa..fa25d0226 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -73,6 +73,13 @@ run test_logistic_pdf_nvrtc_float.cpp ; run test_logistic_quan_nvrtc_double.cpp ; run test_logistic_quan_nvrtc_float.cpp ; +run test_mapairy_cdf_nvrtc_double.cpp ; +run test_mapairy_cdf_nvrtc_float.cpp ; +run test_mapairy_pdf_nvrtc_double.cpp ; +run test_mapairy_pdf_nvrtc_float.cpp ; +run test_mapairy_quan_nvrtc_double.cpp ; +run test_mapairy_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_mapairy_cdf_nvrtc_double.cpp b/test/test_mapairy_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..87e7948b7 --- /dev/null +++ b/test/test_mapairy_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_mapairy_cdf_nvrtc_float.cpp b/test/test_mapairy_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d84404c5e --- /dev/null +++ b/test/test_mapairy_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_mapairy_pdf_nvrtc_double.cpp b/test/test_mapairy_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..5f461d6d1 --- /dev/null +++ b/test/test_mapairy_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_mapairy_pdf_nvrtc_float.cpp b/test/test_mapairy_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..39eb4152d --- /dev/null +++ b/test/test_mapairy_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_mapairy_quan_nvrtc_double.cpp b/test/test_mapairy_quan_nvrtc_double.cpp new file mode 100644 index 000000000..43ac17c84 --- /dev/null +++ b/test/test_mapairy_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_mapairy_quan_nvrtc_float.cpp b/test/test_mapairy_quan_nvrtc_float.cpp new file mode 100644 index 000000000..a127843e9 --- /dev/null +++ b/test/test_mapairy_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_mapairy_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::mapairy_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_mapairy_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_mapairy_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_mapairy_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::mapairy_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 64a92dd527c2bce76201f8a068b2ca7ab3328ff9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 15 Aug 2024 10:50:38 -0400 Subject: [PATCH 34/35] Add NVRTC support to saspoint5 dist --- .../boost/math/distributions/saspoint5.hpp | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/include/boost/math/distributions/saspoint5.hpp b/include/boost/math/distributions/saspoint5.hpp index 3e78fca97..7846b9956 100644 --- a/include/boost/math/distributions/saspoint5.hpp +++ b/include/boost/math/distributions/saspoint5.hpp @@ -15,15 +15,21 @@ #include #include #include -#include #include #include #include #include #include #include -#include +#include +#include +#include +#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include #include +#endif namespace boost { namespace math { template @@ -32,7 +38,7 @@ class saspoint5_distribution; namespace detail { template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -337,7 +343,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealTyp template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -849,14 +855,14 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_plus_imp_prec(const RealTyp template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_imp_prec(const RealType& x, const std::integral_constant &tag) { +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_imp_prec(const RealType& x, const boost::math::integral_constant &tag) { BOOST_MATH_STD_USING // for ADL of std functions return saspoint5_pdf_plus_imp_prec(abs(x), tag); } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_imp_prec(const RealType& x, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_imp_prec(const RealType& x, const boost::math::integral_constant& tag) { BOOST_MATH_STD_USING // for ADL of std functions return saspoint5_pdf_plus_imp_prec(abs(x), tag); @@ -889,7 +895,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_pdf_imp(const saspoint5_distrib typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1163,7 +1169,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealTyp template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealType& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealType& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1642,7 +1648,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_plus_imp_prec(const RealTyp } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? saspoint5_cdf_plus_imp_prec(x, tag) : 1 - saspoint5_cdf_plus_imp_prec(x, tag); } @@ -1655,7 +1661,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp_prec(const RealType& x, } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp_prec(const RealType& x, bool complement, const std::integral_constant& tag) { +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp_prec(const RealType& x, bool complement, const boost::math::integral_constant& tag) { if (x >= 0) { return complement ? saspoint5_cdf_plus_imp_prec(x, tag) : 1 - saspoint5_cdf_plus_imp_prec(x, tag); } @@ -1694,7 +1700,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_cdf_imp(const saspoint5_distrib typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -1994,7 +2000,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const R template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const RealType& p, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const RealType& p, const boost::math::integral_constant&) { BOOST_MATH_STD_USING RealType result; @@ -2493,7 +2499,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_upper_imp_prec(const R } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? saspoint5_quantile_upper_imp_prec(1 - p, tag) : -saspoint5_quantile_upper_imp_prec(1 - p, tag); @@ -2503,7 +2509,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp_prec(const RealTyp } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp_prec(const RealType& p, bool complement, const std::integral_constant& tag) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp_prec(const RealType& p, bool complement, const boost::math::integral_constant& tag) { if (p > 0.5) { return !complement ? saspoint5_quantile_upper_imp_prec(1 - p, tag) : -saspoint5_quantile_upper_imp_prec(1 - p, tag); @@ -2540,7 +2546,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_quantile_imp(const saspoint5_di typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_entropy_imp_prec(const boost::math::integral_constant&) { return static_cast(3.63992444568030649573); } template -BOOST_MATH_GPU_ENABLED inline RealType saspoint5_entropy_imp_prec(const std::integral_constant&) +BOOST_MATH_GPU_ENABLED inline RealType saspoint5_entropy_imp_prec(const boost::math::integral_constant&) { return BOOST_MATH_BIG_CONSTANT(RealType, 113, 3.6399244456803064957308496039071853510); } @@ -2583,7 +2589,7 @@ BOOST_MATH_GPU_ENABLED inline RealType saspoint5_entropy_imp(const saspoint5_dis typedef typename tools::promote_args::type result_type; typedef typename policies::precision::type precision_type; - typedef std::integral_constant Date: Thu, 15 Aug 2024 10:50:50 -0400 Subject: [PATCH 35/35] Add saspoint5 NVRTC testing --- test/nvrtc_jamfile | 7 + test/test_saspoint5_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_saspoint5_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_saspoint5_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_saspoint5_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++++++ test/test_saspoint5_quan_nvrtc_double.cpp | 191 ++++++++++++++++++++++ test/test_saspoint5_quan_nvrtc_float.cpp | 191 ++++++++++++++++++++++ 7 files changed, 1153 insertions(+) create mode 100644 test/test_saspoint5_cdf_nvrtc_double.cpp create mode 100644 test/test_saspoint5_cdf_nvrtc_float.cpp create mode 100644 test/test_saspoint5_pdf_nvrtc_double.cpp create mode 100644 test/test_saspoint5_pdf_nvrtc_float.cpp create mode 100644 test/test_saspoint5_quan_nvrtc_double.cpp create mode 100644 test/test_saspoint5_quan_nvrtc_float.cpp diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index fa25d0226..8d33eabb5 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -80,6 +80,13 @@ run test_mapairy_pdf_nvrtc_float.cpp ; run test_mapairy_quan_nvrtc_double.cpp ; run test_mapairy_quan_nvrtc_float.cpp ; +run test_saspoint5_cdf_nvrtc_double.cpp ; +run test_saspoint5_cdf_nvrtc_float.cpp ; +run test_saspoint5_pdf_nvrtc_double.cpp ; +run test_saspoint5_pdf_nvrtc_float.cpp ; +run test_saspoint5_quan_nvrtc_double.cpp ; +run test_saspoint5_quan_nvrtc_float.cpp ; + # Special Functions run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; diff --git a/test/test_saspoint5_cdf_nvrtc_double.cpp b/test/test_saspoint5_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..ff2067fa0 --- /dev/null +++ b/test/test_saspoint5_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_saspoint5_cdf_nvrtc_float.cpp b/test/test_saspoint5_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..11c816da1 --- /dev/null +++ b/test/test_saspoint5_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_saspoint5_pdf_nvrtc_double.cpp b/test/test_saspoint5_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..4c7444383 --- /dev/null +++ b/test/test_saspoint5_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_saspoint5_pdf_nvrtc_float.cpp b/test/test_saspoint5_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..8cd93aaa9 --- /dev/null +++ b/test/test_saspoint5_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_saspoint5_quan_nvrtc_double.cpp b/test/test_saspoint5_quan_nvrtc_double.cpp new file mode 100644 index 000000000..631ac4b24 --- /dev/null +++ b/test/test_saspoint5_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_saspoint5_quan_nvrtc_float.cpp b/test/test_saspoint5_quan_nvrtc_float.cpp new file mode 100644 index 000000000..fa152622e --- /dev/null +++ b/test/test_saspoint5_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_saspoint5_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::saspoint5_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_saspoint5_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_saspoint5_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_saspoint5_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::saspoint5_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +}