Half factorization #1712

Open
wants to merge 9 commits into base: half_solver
4 changes: 0 additions & 4 deletions cmake/create_test.cmake
@@ -168,10 +168,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda --expt-relaxed-constexpr>)
endif()
# we handle CUDA architecture flags for now, disable CMake handling
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF)
endif()
ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN})
ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu)
endfunction(ginkgo_create_cuda_test_internal)
12 changes: 12 additions & 0 deletions common/cuda_hip/base/math.hpp
@@ -157,6 +157,18 @@ __device__ __forceinline__ __half sqrt(const __half& val)
}


// Use an overload here; otherwise the compiler still treats the is_finite
// specialization as a __host__ __device__ function.
__device__ __forceinline__ bool is_finite(const __half& value)
{
return abs(value) < device_numeric_limits<__half>::inf();
}

__device__ __forceinline__ bool is_finite(const thrust::complex<__half>& value)
{
return is_finite(value.real()) && is_finite(value.imag());
}

#endif


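Note on the is_finite overloads above: a specialization of the generic is_finite template would keep the primary template's __host__ __device__ attributes, whereas a plain __device__ overload stays device-only and wins overload resolution in device code. A minimal standalone sketch of the same pattern (hypothetical names, not Ginkgo code; 65504 is the largest finite half value):

#include <cuda_fp16.h>
#include <math.h>

// Generic version, callable from host and device for float/double.
template <typename T>
__host__ __device__ bool is_finite_like(T value)
{
    return isfinite(static_cast<double>(value));
}

// Device-only overload for __half: being an overload rather than a
// specialization, it does not inherit the template's __host__ __device__
// attributes.
__device__ __forceinline__ bool is_finite_like(__half value)
{
    // |x| <= 65504 holds for every finite half; NaN and +/-inf fail the test.
    return fabsf(__half2float(value)) <= 65504.0f;
}

__global__ void mark_finite(const __half* in, bool* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = is_finite_like(in[i]);
    }
}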
242 changes: 242 additions & 0 deletions common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -1031,3 +1031,245 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
"d"(real_result), "d"(imag_result)
: "memory");
}


__device__ __forceinline__ __half load_relaxed_local(const __half* ptr)
{
float result;
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
" ld.volatile.b16 t, [%1];\n\t"
#else
" ld.relaxed.cta.b16 t, [%1];\n\t"
#endif
" cvt.f32.f16 %0, t;\n\t"
"}"
: "=f"(result)
: "l"(const_cast<__half*>(ptr))
: "memory");

return static_cast<__half>(result);
}


__device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result)
{
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
" cvt.rn.f16.f32 t, %1;\n\t"
#if __CUDA_ARCH__ < 700
" st.volatile.b16 [%0], t;\n\t"
#else
" st.relaxed.cta.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(static_cast<float>(result))
: "memory");
}


__device__ __forceinline__ __half load_acquire_local(const __half* ptr)
{
float result;
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
" ld.volatile.b16 t, [%1];\n\t"
#else
" ld.acquire.cta.b16 t, [%1];\n\t"
#endif
" cvt.f32.f16 %0, t;\n\t"
"}"
: "=f"(result)
: "l"(const_cast<__half*>(ptr))
: "memory");
membar_acq_rel_local();
return static_cast<__half>(result);
}


__device__ __forceinline__ void store_release_local(__half* ptr, __half result)
{
membar_acq_rel_local();
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
" cvt.rn.f16.f32 t, %1;\n\t"
#if __CUDA_ARCH__ < 700
" st.volatile.b16 [%0], t;\n\t"
#else
" st.release.cta.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(static_cast<float>(result))
: "memory");
}


__device__ __forceinline__ __half load_relaxed(const __half* ptr)
{
float result;
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
" ld.volatile.b16 t, [%1];\n\t"
#else
" ld.relaxed.gpu.b16 t, [%1];\n\t"
#endif
" cvt.f32.f16 %0, t;\n\t"
"}"
: "=f"(result)
: "l"(const_cast<__half*>(ptr))
: "memory");

return static_cast<__half>(result);
}


__device__ __forceinline__ void store_relaxed(__half* ptr, __half result)
{
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
" cvt.rn.f16.f32 t, %1;\n\t"
#if __CUDA_ARCH__ < 700
" st.volatile.b16 [%0], t;\n\t"
#else
" st.relaxed.gpu.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(static_cast<float>(result))
: "memory");
}


__device__ __forceinline__ __half load_acquire(const __half* ptr)
{
float result;
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
" ld.volatile.b16 t, [%1];\n\t"
#else
" ld.acquire.gpu.b16 t, [%1];\n\t"
#endif
" cvt.f32.f16 %0, t;\n\t"
"}"
: "=f"(result)
: "l"(const_cast<__half*>(ptr))
: "memory");
membar_acq_rel();
return static_cast<__half>(result);
}


__device__ __forceinline__ void store_release(__half* ptr, __half result)
{
membar_acq_rel();
asm volatile(
"{\n\t"
" .reg .f16 t;\n\t"
" cvt.rn.f16.f32 t, %1;\n\t"
#if __CUDA_ARCH__ < 700
" st.volatile.b16 [%0], t;\n\t"
#else
" st.release.gpu.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(static_cast<float>(result))
: "memory");
}


__device__ __forceinline__ thrust::complex<__half> load_relaxed_local(
const thrust::complex<__half>* ptr)
{
float real_result;
float imag_result;
asm volatile(
"{\n\t"
" .reg .v2 .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
"ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t"
#else
"ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t"
#endif
" cvt.f32.f16 %0, t.x;\n\t"
" cvt.f32.f16 %1, t.y;\n\t"
"}"
: "=f"(real_result), "=f"(imag_result)
: "l"(const_cast<thrust::complex<__half>*>(ptr))
: "memory");
return thrust::complex<__half>{real_result, imag_result};
}


__device__ __forceinline__ void store_relaxed_local(
thrust::complex<__half>* ptr, thrust::complex<__half> result)
{
auto real_result = static_cast<float>(result.real());
auto imag_result = static_cast<float>(result.imag());
asm volatile(
"{\n\t"
" .reg .v2 .f16 t;\n\t"
" cvt.rn.f16.f32 t.x, %1;\n\t"
" cvt.rn.f16.f32 t.y, %2;\n\t"
#if __CUDA_ARCH__ < 700
"st.volatile.v2.b16 [%0], t;\n\t"
#else
"st.relaxed.cta.v2.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
}


__device__ __forceinline__ thrust::complex<__half> load_relaxed(
const thrust::complex<__half>* ptr)
{
float real_result;
float imag_result;
asm volatile(
"{\n\t"
" .reg .v2 .f16 t;\n\t"
#if __CUDA_ARCH__ < 700
"ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t"
#else
"ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t"
#endif
" cvt.f32.f16 %0, t.x;\n\t"
" cvt.f32.f16 %1, t.y;\n\t"
"}"
: "=f"(real_result), "=f"(imag_result)
: "l"(const_cast<thrust::complex<__half>*>(ptr))
: "memory");
return thrust::complex<__half>{real_result, imag_result};
}


__device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr,
thrust::complex<__half> result)
{
auto real_result = static_cast<float>(result.real());
auto imag_result = static_cast<float>(result.imag());
asm volatile(
"{\n\t"
" .reg .v2 .f16 t;\n\t"
" cvt.rn.f16.f32 t.x, %1;\n\t"
" cvt.rn.f16.f32 t.y, %2;\n\t"
#if __CUDA_ARCH__ < 700
"st.volatile.v2.b16 [%0], t;\n\t"
#else
"st.relaxed.gpu.v2.b16 [%0], t;\n\t"
#endif
"}" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
}
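Note on the __half helpers above: the payload is moved through a 32-bit float register (cvt.f32.f16 on load, cvt.rn.f16.f32 on store) since the asm operands are bound with the "f" (.f32) constraint, and the code falls back to ld.volatile/st.volatile for __CUDA_ARCH__ < 700, where the .relaxed/.acquire/.release qualifiers are not available. A schematic sketch of how a release/acquire pair over these new overloads might be used (illustrative only, not code from this PR, and assuming the functions above are in scope):

// Producer: publish a half payload, then release a half-typed flag so the
// payload store cannot be reordered after the flag store.
__device__ void publish(__half* slot, __half* flag, __half value)
{
    store_relaxed(slot, value);
    store_release(flag, __half(1.0f));
}

// Consumer: spin on the flag with acquire loads, then read the payload.
__device__ __half consume(const __half* slot, const __half* flag)
{
    while (static_cast<float>(load_acquire(flag)) == 0.0f) {
    }
    return load_relaxed(slot);
}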
12 changes: 7 additions & 5 deletions common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -253,7 +253,7 @@ void symbolic_factorize(
postorder, postorder_parent, out_row_ptrs, out_cols);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);


@@ -312,7 +312,7 @@ void forest_from_factor(
build_children_from_parents(exec, forest);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);


@@ -346,7 +346,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
transpose_idxs);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_INITIALIZE);


template <typename ValueType, typename IndexType>
@@ -372,7 +373,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_FACTORIZE);


template <typename ValueType, typename IndexType>
@@ -428,7 +430,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);


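The hunks shown for this file and for the factorization files below only swap the instantiation macros to their _WITH_HALF variants so the same kernels are also explicitly instantiated for half value types. A schematic, self-contained illustration of that macro pattern (hypothetical names and type lists, not Ginkgo's actual macro definitions):

#include <cstdint>
#include <cuda_fp16.h>

template <typename ValueType, typename IndexType>
void dummy_factorize(const ValueType* values, const IndexType* row_ptrs)
{
    // stands in for a factorization kernel launcher
}

#define DECLARE_DUMMY_FACTORIZE(ValueType, IndexType) \
    void dummy_factorize(const ValueType*, const IndexType*)

// Base list: explicit instantiations for the usual value types.
#define INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \
    template _macro(float, std::int32_t);                 \
    template _macro(double, std::int32_t)

// _WITH_HALF variant: the same list plus the half value type.
#define INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(_macro) \
    INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro);              \
    template _macro(__half, std::int32_t)

INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_DUMMY_FACTORIZE);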
10 changes: 5 additions & 5 deletions common/cuda_hip/factorization/factorization_kernels.cpp
@@ -355,7 +355,7 @@ void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
mtx_builder.get_col_idx_array() = std::move(new_col_idx_array);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);


@@ -385,7 +385,7 @@ void initialize_row_ptrs_l_u(
components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);


@@ -418,7 +418,7 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);


@@ -446,7 +446,7 @@ void initialize_row_ptrs_l(
components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);


@@ -481,7 +481,7 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);


3 changes: 2 additions & 1 deletion common/cuda_hip/factorization/ic_kernels.cpp
@@ -54,7 +54,8 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
sparselib::destroy(desc);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_IC_COMPUTE_KERNEL);


} // namespace ic_factorization
2 changes: 1 addition & 1 deletion common/cuda_hip/factorization/ilu_kernels.cpp
@@ -54,7 +54,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
sparselib::destroy(desc);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);

