diff --git a/.wordlist.txt b/.wordlist.txt
index 2033214571..b3b8686678 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -6,6 +6,7 @@ APU
APUs
AQL
AXPY
+asm
Asynchrony
backtrace
Bitcode
@@ -15,6 +16,7 @@ builtins
Builtins
CAS
clr
+compilable
coroutines
Ctx
cuBLASLt
@@ -42,12 +44,14 @@ extern
fatbin
fatbinary
foundationally
+framebuffer
frontends
fnuz
FNUZ
fp
gedit
GPGPU
+GROMACS
GWS
hardcoded
HC
@@ -58,6 +62,7 @@ hipcc
hipCtx
hipexamine
hipified
+HIPify
hipModule
hipModuleLaunchKernel
hipother
@@ -65,9 +70,12 @@ HIPRTC
icc
IILE
iGPU
+inlined
inplace
-Interoperation
+interop
+interoperation
interoperate
+interoperation
Interprocess
interprocess
Intrinsics
@@ -75,6 +83,7 @@ intrinsics
IPC
IPs
isa
+iteratively
Lapack
latencies
libc
@@ -87,6 +96,8 @@ ltrace
makefile
Malloc
malloc
+MALU
+MiB
memset
multicore
multigrid
@@ -101,9 +112,12 @@ NOP
Numa
Nsight
ocp
+omnitrace
overindex
overindexing
oversubscription
+overutilized
+parallelizable
pixelated
pragmas
preallocated
@@ -111,6 +125,7 @@ preconditioners
predefining
prefetched
preprocessor
+profilers
PTX
PyHIP
queryable
@@ -118,6 +133,7 @@ prefetching
quad
representable
RMW
+rocgdb
ROCm's
rocTX
roundtrip
@@ -129,6 +145,7 @@ scalarizing
sceneries
shaders
SIMT
+sinewave
SOMA
SPMV
structs
@@ -139,11 +156,16 @@ texels
tradeoffs
templated
toolkits
+transfering
typedefs
unintuitive
UMM
unmap
+unmapped
+unmapping
+unregister
upscaled
variadic
+vulkan
WinGDB
-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+zc
diff --git a/README.md b/README.md
index d5f278f3c2..610b2a89c7 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ HIP releases are typically naming convention for each ROCM release to help diffe
## More Info
* [Installation](docs/install/install.rst)
-* [HIP FAQ](docs/how-to/faq.md)
+* [HIP FAQ](docs/faq.rst)
* [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst)
* [HIP Porting Guide](docs/how-to/hip_porting_guide.md)
* [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.md)
diff --git a/docs/conf.py b/docs/conf.py
index 82bcefee89..aed3ead08d 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -47,8 +47,8 @@
numfig = False
-
exclude_patterns = [
"doxygen/mainpage.md",
- "understand/glossary.md"
+ "understand/glossary.md",
+ 'how-to/debugging_env.rst'
]
\ No newline at end of file
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.drawio
similarity index 100%
rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio
rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.drawio
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg
similarity index 100%
rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.drawio
similarity index 100%
rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio
rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.drawio
diff --git a/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg b/docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg
similarity index 100%
rename from docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
rename to docs/data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg
diff --git a/docs/data/how-to/hipgraph/hip_graph.drawio b/docs/data/how-to/hip_runtime_api/hipgraph/hip_graph.drawio
similarity index 100%
rename from docs/data/how-to/hipgraph/hip_graph.drawio
rename to docs/data/how-to/hip_runtime_api/hipgraph/hip_graph.drawio
diff --git a/docs/data/how-to/hipgraph/hip_graph.svg b/docs/data/how-to/hip_runtime_api/hipgraph/hip_graph.svg
similarity index 100%
rename from docs/data/how-to/hipgraph/hip_graph.svg
rename to docs/data/how-to/hip_runtime_api/hipgraph/hip_graph.svg
diff --git a/docs/data/how-to/hipgraph/hip_graph_speedup.drawio b/docs/data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.drawio
similarity index 100%
rename from docs/data/how-to/hipgraph/hip_graph_speedup.drawio
rename to docs/data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.drawio
diff --git a/docs/data/how-to/hipgraph/hip_graph_speedup.svg b/docs/data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.svg
similarity index 100%
rename from docs/data/how-to/hipgraph/hip_graph_speedup.svg
rename to docs/data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.svg
diff --git a/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio
new file mode 100644
index 0000000000..602c7e501d
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.drawio
@@ -0,0 +1,106 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg
new file mode 100644
index 0000000000..8ffb8aa965
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/data/understand/textures/border.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/border.png
similarity index 100%
rename from docs/data/understand/textures/border.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/border.png
diff --git a/docs/data/understand/textures/clamp.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/clamp.png
similarity index 100%
rename from docs/data/understand/textures/clamp.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/clamp.png
diff --git a/docs/data/understand/textures/linear.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/linear.png
similarity index 100%
rename from docs/data/understand/textures/linear.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/linear.png
diff --git a/docs/data/understand/textures/mirror.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/mirror.png
similarity index 100%
rename from docs/data/understand/textures/mirror.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/mirror.png
diff --git a/docs/data/understand/textures/nearest.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/nearest.png
similarity index 100%
rename from docs/data/understand/textures/nearest.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/nearest.png
diff --git a/docs/data/understand/textures/original.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/original.png
similarity index 100%
rename from docs/data/understand/textures/original.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/original.png
diff --git a/docs/data/understand/textures/wrap.png b/docs/data/how-to/hip_runtime_api/memory_management/textures/wrap.png
similarity index 100%
rename from docs/data/understand/textures/wrap.png
rename to docs/data/how-to/hip_runtime_api/memory_management/textures/wrap.png
diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio
new file mode 100644
index 0000000000..1deeca61f5
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.drawio
@@ -0,0 +1,1880 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
new file mode 100644
index 0000000000..83accc3b27
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
@@ -0,0 +1,9 @@
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/runtimes.drawio b/docs/data/how-to/hip_runtime_api/runtimes.drawio
new file mode 100644
index 0000000000..a01c453452
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/runtimes.drawio
@@ -0,0 +1,127 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/runtimes.svg b/docs/data/how-to/hip_runtime_api/runtimes.svg
new file mode 100644
index 0000000000..a64a7a54dc
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/runtimes.svg
@@ -0,0 +1,2 @@
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/stream_management.drawio b/docs/data/how-to/hip_runtime_api/stream_management.drawio
new file mode 100644
index 0000000000..2b443fe3f0
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/stream_management.drawio
@@ -0,0 +1,46 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/data/how-to/hip_runtime_api/stream_management.svg b/docs/data/how-to/hip_runtime_api/stream_management.svg
new file mode 100644
index 0000000000..c7a05657f1
--- /dev/null
+++ b/docs/data/how-to/hip_runtime_api/stream_management.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/data/unified_memory/um.drawio b/docs/data/unified_memory/um.drawio
deleted file mode 100644
index fac74f4b60..0000000000
--- a/docs/data/unified_memory/um.drawio
+++ /dev/null
@@ -1,1878 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/docs/data/unified_memory/um.svg b/docs/data/unified_memory/um.svg
deleted file mode 100644
index 748949b271..0000000000
--- a/docs/data/unified_memory/um.svg
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/docs/data/what_is_hip/hip.drawio b/docs/data/what_is_hip/hip.drawio
new file mode 100644
index 0000000000..1a47e4b097
--- /dev/null
+++ b/docs/data/what_is_hip/hip.drawio
@@ -0,0 +1,157 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/data/what_is_hip/hip.svg b/docs/data/what_is_hip/hip.svg
new file mode 100644
index 0000000000..c151dc8717
--- /dev/null
+++ b/docs/data/what_is_hip/hip.svg
@@ -0,0 +1,2 @@
+
\ No newline at end of file
diff --git a/docs/faq.rst b/docs/faq.rst
new file mode 100644
index 0000000000..76fadf658d
--- /dev/null
+++ b/docs/faq.rst
@@ -0,0 +1,241 @@
+.. meta::
+ :description: This page lists frequently asked questions about HIP
+ :keywords: AMD, ROCm, HIP, FAQ, frequently asked questions
+
+*******************************************************************************
+Frequently asked questions
+*******************************************************************************
+
+This topic provides answers to frequently asked questions from new HIP users and
+users familiar with NVIDIA CUDA.
+
+HIP Support
+===========
+
+What hardware does HIP support?
+-------------------------------
+
+HIP supports AMD and NVIDIA GPUs. See
+:ref:`prerequisites of the install guide` for detailed
+information.
+
+What operating systems does HIP support?
+----------------------------------------
+
+Linux as well as Windows are supported by ROCm. The exact versions are listed in
+the system requirements for :ref:`rocm-install-on-linux:supported_distributions`
+and :ref:`rocm-install-on-windows:supported-skus-win`.
+
+.. note::
+ Not all HIP runtime API functions are yet supported on Windows.
+ A note is added to those functions' documentation in the
+ :ref:`HIP runtime API reference`.
+
+What libraries does HIP provide?
+--------------------------------
+
+HIP provides key math and AI libraries. See :doc:`rocm:reference/api-libraries`
+for the full list.
+
+What NVIDIA CUDA features does HIP support?
+-------------------------------------------
+
+The :doc:`NVIDIA CUDA runtime API supported by HIP`
+and :doc:`NVIDIA CUDA driver API supported by HIP`
+pages describe which NVIDIA CUDA APIs are supported and what the equivalents are.
+The :doc:`HIP API documentation ` describes each API and
+its limitations, if any, compared with the equivalent CUDA API.
+
+The kernel language features are documented in the
+:doc:`/reference/cpp_language_extensions` page.
+
+Relation to other GPGPU frameworks
+==================================
+
+Is HIP a drop-in replacement for CUDA?
+--------------------------------------
+
+The `HIPIFY `_ tools can automatically convert
+almost all CUDA runtime code to HIP. Most device code needs no additional
+conversion because HIP and CUDA have the same signatures for math and built-in
+functions except for the name. HIP code provides performance similar to native
+CUDA code on NVIDIA platforms, plus the benefits of being compilable for AMD
+platforms.
+
+Additional porting might be required to deal with architecture feature
+queries or CUDA capabilities that HIP doesn't support.
+
+How does HIP compare with OpenCL?
+---------------------------------
+
+HIP offers several benefits over OpenCL:
+
+* Device code can be written in modern C++, including templates, lambdas,
+ classes and so on.
+* Host and device code can be mixed in the source files.
+* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
+* Porting from CUDA to HIP is significantly easier than from CUDA to OpenCL.
+* HIP uses development tools specialized for each platform: :doc:`amdclang++ `
+ for AMD GPUs or `nvcc `_
+ for NVIDIA GPUs, and profilers like :doc:`omniperf ` or
+ `Nsight Systems `_.
+* HIP provides
+ * pointers and host-side pointer arithmetic.
+ * device-level control over memory allocation and placement.
+ * an offline compilation model.
+
+How does porting CUDA to HIP compare to porting CUDA to OpenCL?
+---------------------------------------------------------------
+
+OpenCL differs from HIP and CUDA when considering the host runtime,
+but even more so when considering the kernel code.
+The HIP device code is a C++ dialect, while OpenCL is C99-based.
+OpenCL does not support single-source compilation.
+
+As a result, the OpenCL syntax differs significantly from HIP, and porting tools
+must perform complex transformations, especially regarding templates or other
+C++ features in kernels.
+
+To better understand the syntax differences, see :doc:`here` or
+the :doc:`HIP porting guide `.
+
+Can I install CUDA and ROCm on the same machine?
+------------------------------------------------
+
+Yes, but you require a compatible GPU to run the compiled code.
+
+On NVIDIA platforms, can I mix HIP code with CUDA code?
+-------------------------------------------------------
+
+Yes. Most HIP types and data structures are ``typedef``\ s to CUDA equivalents and
+can be used interchangeably. This can be useful for iteratively porting CUDA code.
+
+See :doc:`how-to/hip_porting_guide` for more details.
+
+Can a HIP binary run on both AMD and NVIDIA platforms?
+------------------------------------------------------
+
+HIP is a source-portable language that can be compiled to run on AMD or NVIDIA
+platforms. However, the HIP tools don't create a "fat binary" that can run on
+both platforms.
+
+Compiler related questions
+==========================
+
+hipcc detected my platform incorrectly. What should I do?
+---------------------------------------------------------
+
+The environment variable ``HIP_PLATFORM`` can be used to specify the platform
+for which the code is going to be compiled with ``hipcc``. See the
+:doc:`hipcc environment variables` for more information.
+
+.. warning::
+ If you specify ``HIP_PLATFORM=nvidia`` with ``hipcc``, you also need to pass ``-x cu``
+ to hipcc when compiling files with the ``.hip`` file extension. Otherwise,
+ nvcc will not recognize the ``.hip`` file extension and will fail with
+ ``nvcc fatal : Don't know what to do with .hip``.
+
+How to use HIP-Clang to build HIP programs?
+------------------------------------------------------
+
+:doc:`hipcc ` is a compiler driver. This means it is not a compiler
+but calls the appropriate compilers and sets some options.
+
+The underlying compilers are :doc:`amdclang++ ` or
+`nvcc `_,
+depending on the platform, and can be called directly.
+
+What is HIP-Clang?
+------------------
+
+HIP-Clang is a Clang/LLVM-based compiler used to compile HIP programs for AMD
+platforms. The executable is named :doc:`amdclang++ ` on
+Linux and ``clang++`` on Windows.
+
+Can I link HIP device code with host code compiled with another compiler such as gcc, icc, or clang?
+-----------------------------------------------------------------------------------------------------------
+
+Yes. HIP generates object code that conforms to the GCC ABI, and links with libstdc++.
+This means you can compile host code with the compiler of your choice and link the
+generated host object code with device code.
+
+Can HIP applications be compiled with a C compiler?
+---------------------------------------------------
+
+HIP is a C/C++ API that can be used with C compilers. However, this applies only
+to the API itself. Device code and the syntax for calling kernels must be
+compiled with a supported compiler like :doc:`hipcc `. The code
+objects that are generated with ``hipcc`` can, however, be used with a C
+compiler, as shown in the code examples below.
+
+The following is the HIP device code, assumed to be saved in ``device.hip``:
+
+.. code-block:: c++
+
+ #include <hip/hip_runtime.h>
+
+ __global__ void kernel(double* array, size_t size){
+ const int x = threadIdx.x + blockIdx.x * blockDim.x;
+ if(x < size){array[x] = x;}
+ };
+
+ extern "C"{
+ hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size){
+ kernel<<<blocks, threadsPerBlock>>>(array, size);
+ return hipGetLastError();
+ }
+ }
+
+The following is the host code, written in C, saved in ``host.c``:
+
+.. code-block:: c
+
+ #include <hip/hip_runtime_api.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #define HIP_CHECK(c) { \
+ if (c != hipSuccess){ \
+ printf("HIP Error : %s", hipGetErrorString(c)); \
+ printf(" %s %d\n", __FILE__, __LINE__); \
+ exit(c); \
+ } \
+ }
+
+ // Forward declaration - the implementation needs to be compiled with
+ // a device compiler like hipcc or amdclang++
+ hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size);
+
+ int main(int argc, char** argv) {
+ int blocks = 1024;
+ int threadsPerBlock = 256;
+ size_t arraySize = blocks * threadsPerBlock;
+ double* d_array;
+ double* h_array;
+ h_array = (double*)malloc(arraySize * sizeof(double));
+
+ HIP_CHECK(hipMalloc((void**)&d_array, arraySize * sizeof(double)));
+ HIP_CHECK(callKernel(blocks, threadsPerBlock, d_array, arraySize));
+ HIP_CHECK(hipMemcpy(h_array, d_array, arraySize * sizeof(double), hipMemcpyDeviceToHost));
+ HIP_CHECK(hipFree(d_array));
+
+ free(h_array);
+ return 0;
+ }
+
+These files are then compiled and linked using
+
+.. code-block:: shell
+
+ hipcc -c device.hip
+ gcc host.c device.o $(hipconfig --cpp_config) -L/opt/rocm/lib -lamdhip64
+
+assuming the default installation of ROCm in ``/opt/rocm``.
+
+How to guard code specific to the host or the GPU?
+--------------------------------------------------
+
+The compiler defines the ``__HIP_DEVICE_COMPILE__`` macro only when compiling
+device code.
+
+Refer to the :doc:`how-to/hip_porting_guide` for more information.
diff --git a/docs/how-to/debugging.rst b/docs/how-to/debugging.rst
index 2f9e6ff307..433d31de10 100644
--- a/docs/how-to/debugging.rst
+++ b/docs/how-to/debugging.rst
@@ -2,12 +2,13 @@
:description: How to debug using HIP.
:keywords: AMD, ROCm, HIP, debugging, ltrace, ROCgdb, WinGDB
+.. _debugging_with_hip:
+
*************************************************************************
Debugging with HIP
*************************************************************************
-AMD debugging tools include *ltrace* and *ROCgdb*. External tools are available and can be found
-online. For example, if you're using Windows, you can use *Microsoft Visual Studio* and *WinGDB*.
+HIP debugging tools include `ltrace `_ and :doc:`ROCgdb `. External tools are available and can be found online. For example, if you're using Windows, you can use Microsoft Visual Studio and WinGDB.
You can trace and debug your code using the following tools and techniques.
@@ -272,110 +273,7 @@ HIP environment variable summary
Here are some of the more commonly used environment variables:
-..
-
-.. # COMMENT: The following lines define a break for use in the table below.
-.. |break| raw:: html
-
-
-
-..
-
-.. list-table::
-
- * - **Environment variable**
- - **Default value**
- - **Usage**
-
- * - AMD_LOG_LEVEL
- |break| Enable HIP log on different Level
- - 0
- - 0: Disable log.
- |break| 1: Enable log on error level
- |break| 2: Enable log on warning and below levels
- |break| 0x3: Enable log on information and below levels
- |break| 0x4: Decode and display AQL packets
-
- * - AMD_LOG_MASK
- |break| Enable HIP log on different Level
- - 0x7FFFFFFF
- - 0x1: Log API calls
- |break| 0x02: Kernel and Copy Commands and Barriers
- |break| 0x4: Synchronization and waiting for commands to finish
- |break| 0x8: Enable log on information and below levels
- |break| 0x20: Queue commands and queue contents
- |break| 0x40: Signal creation, allocation, pool
- |break| 0x80: Locks and thread-safety code
- |break| 0x100: Copy debug
- |break| 0x200: Detailed copy debug
- |break| 0x400: Resource allocation, performance-impacting events
- |break| 0x800: Initialization and shutdown
- |break| 0x1000: Misc debug, not yet classified
- |break| 0x2000: Show raw bytes of AQL packet
- |break| 0x4000: Show code creation debug
- |break| 0x8000: More detailed command info, including barrier commands
- |break| 0x10000: Log message location
- |break| 0xFFFFFFFF: Log always even mask flag is zero
-
- * - HIP_LAUNCH_BLOCKING
- |break| Used for serialization on kernel execution.
- - 0
- - 0: Disable. Kernel executes normally.
- |break| 1: Enable. Serializes kernel enqueue, behaves the same as AMD_SERIALIZE_KERNEL.
-
- * - HIP_VISIBLE_DEVICES (or CUDA_VISIBLE_DEVICES)
- |break| Only devices whose index is present in the sequence are visible to HIP
- -
- - 0,1,2: Depending on the number of devices on the system
-
- * - GPU_DUMP_CODE_OBJECT
- |break| Dump code object
- - 0
- - 0: Disable
- |break| 1: Enable
-
- * - AMD_SERIALIZE_KERNEL
- |break| Serialize kernel enqueue
- - 0
- - 1: Wait for completion before enqueue
- |break| 2: Wait for completion after enqueue
- |break| 3: Both
-
- * - AMD_SERIALIZE_COPY
- |break| Serialize copies
- - 0
- - 1: Wait for completion before enqueue
- |break| 2: Wait for completion after enqueue
- |break| 3: Both
-
- * - HIP_HOST_COHERENT
- |break| Coherent memory in hipHostMalloc
- - 0
- - 0: memory is not coherent between host and GPU
- |break| 1: memory is coherent with host
-
- * - AMD_DIRECT_DISPATCH
- |break| Enable direct kernel dispatch (Currently for Linux; under development for Windows)
- - 1
- - 0: Disable
- |break| 1: Enable
-
- * - GPU_MAX_HW_QUEUES
- |break| The maximum number of hardware queues allocated per device
- - 4
- - The variable controls how many independent hardware queues HIP runtime can create per process,
- per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
- the same hardware queues for the new streams in a round-robin manner. Note that this maximum
- number does not apply to hardware queues that are created for CU-masked HIP streams, or
- cooperative queues for HIP Cooperative Groups (single queue per device).
-
- * - DEBUG_HIP_7_PREVIEW
- |break| Enable preview of upcoming runtime changes that break backward compatibility.
- These changes might require updating existing application code to support the new behavior.
- The new behavior will become default in a future major release and this environment
- variable will no longer be needed.
- - 0
- - 0x1: Match the behavior of hipGetLastError with its corresponding CUDA API
+.. include:: ../how-to/debugging_env.rst
General debugging tips
======================================================
diff --git a/docs/how-to/debugging_env.rst b/docs/how-to/debugging_env.rst
new file mode 100644
index 0000000000..10b86a8adb
--- /dev/null
+++ b/docs/how-to/debugging_env.rst
@@ -0,0 +1,110 @@
+.. list-table::
+ :header-rows: 1
+ :widths: 35,14,51
+
+ * - **Environment variable**
+ - **Default value**
+ - **Value**
+
+ * - | ``AMD_LOG_LEVEL``
+ | Enables HIP logging at various levels.
+ - ``0``
+ - | 0: Disable log.
+ | 1: Enables error logs.
+ | 2: Enables warning logs next to lower-level logs.
+ | 3: Enables information logs next to lower-level logs.
+ | 4: Enables debug logs next to lower-level logs.
+ | 5: Enables debug extra logs next to lower-level logs.
+
+ * - | ``AMD_LOG_LEVEL_FILE``
+ | Sets output file for ``AMD_LOG_LEVEL``.
+ - stderr output
+ -
+
+ * - | ``AMD_LOG_MASK``
+ | Specifies HIP log filters. Here is the `complete list of log masks <https://github.com/ROCm/clr/blob/develop/rocclr/utils/debug.hpp#L40>`_.
+ - ``0x7FFFFFFF``
+ - | 0x1: Log API calls.
+ | 0x2: Kernel and copy commands and barriers.
+ | 0x4: Synchronization and waiting for commands to finish.
+ | 0x8: Decode and display AQL packets.
+ | 0x10: Queue commands and queue contents.
+ | 0x20: Signal creation, allocation, pool.
+ | 0x40: Locks and thread-safety code.
+ | 0x80: Kernel creations and arguments, etc.
+ | 0x100: Copy debug.
+ | 0x200: Detailed copy debug.
+ | 0x400: Resource allocation, performance-impacting events.
+ | 0x800: Initialization and shutdown.
+ | 0x1000: Misc debug, not yet classified.
+ | 0x2000: Show raw bytes of AQL packet.
+ | 0x4000: Show code creation debug.
+ | 0x8000: More detailed command info, including barrier commands.
+ | 0x10000: Log message location.
+ | 0x20000: Memory allocation.
+ | 0x40000: Memory pool allocation, including memory in graphs.
+ | 0x80000: Timestamp details.
+ | 0xFFFFFFFF: Log always, even if the mask flag is zero.
+
+ * - | ``HIP_LAUNCH_BLOCKING``
+ | Used for serialization on kernel execution.
+ - ``0``
+ - | 0: Disable. Kernel executes normally.
+ | 1: Enable. Serializes kernel enqueue, behaves the same as ``AMD_SERIALIZE_KERNEL``.
+
+ * - | ``HIP_VISIBLE_DEVICES`` (or ``CUDA_VISIBLE_DEVICES``)
+ | Only devices whose index is present in the sequence are visible to HIP.
+ - Unset by default.
+ - 0,1,2: Depending on the number of devices on the system.
+
+ * - | ``GPU_DUMP_CODE_OBJECT``
+ | Dump code object.
+ - ``0``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``AMD_SERIALIZE_KERNEL``
+ | Serialize kernel enqueue.
+ - ``0``
+ - | 0: Disable
+ | 1: Wait for completion before enqueue.
+ | 2: Wait for completion after enqueue.
+ | 3: Both
+
+ * - | ``AMD_SERIALIZE_COPY``
+ | Serialize copies
+ - ``0``
+ - | 0: Disable
+ | 1: Wait for completion before enqueue.
+ | 2: Wait for completion after enqueue.
+ | 3: Both
+
+ * - | ``AMD_DIRECT_DISPATCH``
+ | Enable direct kernel dispatch (Currently for Linux; under development for Windows).
+ - ``1``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``GPU_MAX_HW_QUEUES``
+ | The maximum number of hardware queues allocated per device.
+ - ``4``
+ - The variable controls how many independent hardware queues HIP runtime can create per process,
+ per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
+ the same hardware queues for the new streams in a round-robin manner. Note that this maximum
+ number does not apply to hardware queues that are created for CU-masked HIP streams, or
+ cooperative queues for HIP Cooperative Groups (single queue per device).
+
+ * - | ``DEBUG_HIP_7_PREVIEW``
+ | Enable preview of upcoming
+ | runtime changes that break
+ | backward compatibility.
+ | These changes might require
+ | updating existing application
+ | code to support the new
+ | behavior. The new behavior
+ | will become default in a
+ | future major release and this
+ | environment variable will
+ | no longer be needed.
+ - 0
+ - 0x1: Match the behavior of hipGetLastError with its corresponding CUDA API
\ No newline at end of file
diff --git a/docs/how-to/faq.md b/docs/how-to/faq.md
deleted file mode 100644
index 24a638190e..0000000000
--- a/docs/how-to/faq.md
+++ /dev/null
@@ -1,386 +0,0 @@
-# Frequently asked questions
-
-## What APIs and features does HIP support?
-
-HIP provides the following:
-
-* Devices (`hipSetDevice()`, `hipGetDeviceProperties()`, etc.)
-* Memory management (`hipMalloc()`, `hipMemcpy()`, `hipFree()`, etc.)
-* Streams (`hipStreamCreate()`, `hipStreamSynchronize()`, `hipStreamWaitEvent()`, etc.)
-* Events (`hipEventRecord()`, `hipEventElapsedTime()`, etc.)
-* Kernel launching (`hipLaunchKernel`/`hipLaunchKernelGGL` is the preferred way of launching kernels. `hipLaunchKernelGGL` is a standard C/C++ macro that can serve as an alternative way to launch kernels, replacing the CUDA triple-chevron (`<<< >>>`) syntax).
-* HIP Module API to control when and how code is loaded.
-* CUDA-style kernel coordinate functions (`threadIdx`, `blockIdx`, `blockDim`, `gridDim`)
-* Cross-lane instructions including `shfl`, `ballot`, `any`, `all`
-* Most device-side math built-ins
-* Error reporting (`hipGetLastError()`, `hipGetErrorString()`)
-
-The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
-
-## What is not supported?
-
-### Runtime/Driver API features
-
-At a high-level, the following features are not supported:
-
-* Textures (partial support available)
-* Dynamic parallelism (CUDA 5.0)
-* Graphics interoperability with OpenGL or Direct3D
-* CUDA IPC Functions (Under Development)
-* CUDA array, `mipmappedArray` and pitched memory
-* Queue priority controls
-
-See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/tables/CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
-
-### Kernel language features
-
-* C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0)
-* Virtual functions, indirect functions and try/catch (CUDA 4.0)
-* `__prof_trigger`
-* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
-* Several kernel features are under development. See the {doc}`/reference/cpp_language_extensions` for more information.
-
-## Is HIP a drop-in replacement for CUDA?
-
-No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs.
-Most developers will port their code from CUDA to HIP and then maintain the HIP version.
-HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
-
-## What specific version of CUDA does HIP support?
-
-HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of the functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions - this is useful for identifying the specific features required by a given application.
-
-However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP. Each bullet below lists the major new language features in each CUDA release and then indicate which are supported/not supported in HIP:
-
-* CUDA 4.0 and earlier :
- * HIP supports CUDA 4.0 except for the limitations described above.
-* CUDA 5.0 :
- * Dynamic Parallelism (not supported)
- * `cuIpc` functions (under development).
-* CUDA 6.0 :
- * Managed memory (under development)
-* CUDA 6.5 :
- * `__shfl` intrinsic (supported)
-* CUDA 7.0 :
- * Per-thread default streams (supported)
- * C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features)
-* CUDA 7.5 :
- * float16 (supported)
-* CUDA 8.0 :
- * Page Migration including `cudaMemAdvise`, `cudaMemPrefetch`, other `cudaMem*` APIs(not supported)
-* CUDA 9.0 :
- * Cooperative Launch, Surface Object Management, Version Management
-
-## What libraries does HIP support?
-
-HIP includes growing support for the four key math libraries using hipBLAS, hipFFT, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications.
-These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications.
-The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces.
-
-* [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS).
-* [hipFFT](https://github.com/ROCmSoftwarePlatform/hipfft)
-* [hipsSPARSE](https://github.com/ROCmSoftwarePlatform/hipsparse)
-* [hipRAND](https://github.com/ROCmSoftwarePlatform/hipRAND)
-* [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen)
-
-Additionally, some of the cuBLAS routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cuBLAS or hcBLAS depending on the platform and replace the need to use conditional compilation.
-
-## How does HIP compare with OpenCL?
-
-Both AMD and NVIDIA support OpenCL 1.2 on their devices so that developers can write portable code.
-HIP offers several benefits over OpenCL:
-
-* Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on.
-* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
-* Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL.
-* HIP uses the best available development tools on each platform: on NVIDIA GPUs, HIP code compiles using NVCC and can employ the Nsight profiler and debugger (unlike OpenCL on NVIDIA GPUs).
-* HIP provides pointers and host-side pointer arithmetic.
-* HIP provides device-level control over memory allocation and placement.
-* HIP offers an offline compilation model.
-
-## How does porting CUDA to HIP compare to porting CUDA to OpenCL?
-
-Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward.
-Both dialects support templates, classes, lambdas, and other C++ constructs.
-As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP.
-HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple.
-This reduces the potential for error, and also makes it easy to automate the translation. HIP goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations.
-
-There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation.
-As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap.
-The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel.
-
-## What hardware does HIP support?
-
-* For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms.
-* For NVIDIA platforms, HIP requires unified memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the NVIDIA Titan and Tesla K40.
-
-## Do HIPIFY tools automatically convert all source code?
-
-Typically, HIPIFY tools can automatically convert almost all run-time code.
-Most device code needs no additional conversion since HIP and CUDA have similar names for math and built-in functions.
-The hipify-clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually).
-Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support.
-In general, developers should always expect to perform some platform-specific tuning and optimization.
-
-## What is NVCC?
-
-NVCC is NVIDIA's compiler driver for compiling "CUDA C++" code into PTX or device code for NVIDIA GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK.
-
-## What is HIP-Clang?
-
-HIP-Clang is a Clang/LLVM based compiler to compile HIP programs which can run on AMD platform.
-
-## Why use HIP rather than supporting CUDA directly?
-
-While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented.
-Developers who code to the HIP API can be assured their code will remain portable across NVIDIA and AMD platforms.
-In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit `WaveSize` which expands the return type for cross-lane functions like ballot and shuffle from 32-bit integers to 64-bit integers.
-
-## Can I develop HIP code on an NVIDIA CUDA platform?
-
-Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends.
-"Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors.
-Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals.
-Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance.
-In some cases, CUDA has a richer set of modes for some APIs, and some C++ capabilities such as virtual functions - see the HIP @API documentation for more details.
-
-## Can I develop HIP code on an AMD HIP-Clang platform?
-
-Yes. HIP's HIP-Clang path only exposes the APIs and functions that work on AMD runtime back ends. "Extra" APIs, parameters and features that appear in HIP-Clang but not CUDA will typically cause compile- or run-time errors. Developers must use the HIP API for most accelerator code and bracket any HIP-Clang specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HIP-Clang supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the HIP-Clang path.
-
-## How to use HIP-Clang to build HIP programs?
-
-The environment variable can be used to set compiler path:
-
-* HIP_CLANG_PATH: path to hip-clang. When set, this variable let hipcc to use hip-clang for compilation/linking.
-
-There is an alternative environment variable to set compiler path:
-
-* HIP_ROCCLR_HOME: path to root directory of the HIP-ROCclr runtime. When set, this variable let hipcc use hip-clang from the ROCclr distribution.
-NOTE: If HIP_ROCCLR_HOME is set, there is no need to set HIP_CLANG_PATH since hipcc will deduce them from HIP_ROCCLR_HOME.
-
-## What is AMD clr?
-
-AMD [Compute Language Runtime (CLR)](https://github.com/ROCm/clr) is a repository for the AMD platform, which contains source codes for AMD's compute languages runtimes as follows,
-
-* hipamd - contains implementation of HIP language for AMD GPU.
-* rocclr - contains virtual device interfaces that compute runtimes interact with backends, such as ROCr on Linux and PAL on Windows.
-* opencl - contains implementation of OpenCL™ on the AMD platform.
-
-## What is hipother?
-
-A new repository ['hipother'](https://github.com/ROCm/hipother) is added in the ROCm 6.1 release, which is branched out from HIP.
-hipother supports the HIP back-end implementation on some non-AMD platforms, like NVIDIA.
-
-## Can I get HIP open source repository for Windows?
-
-No, there is no HIP repository open publicly on Windows.
-
-## Can a HIP binary run on both AMD and NVIDIA platforms?
-
-HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however.
-
-## On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang?
-
-Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code
-with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with NVCC) and host code (compiled with gcc, icc, or clang). These projects
-can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler.
-
-## Can HIP API support C style application? What is the difference between C and C++?
-
-HIP is C++ runtime API that supports C style applications as well.
-
-Some C style applications (and interfaces to other languages (FORTRAN, Python)) would call certain HIP APIs but not use kernel programming.
-They can be compiled with a C compiler and run correctly, however, small details must be considered in the code. For example, initialization, as shown in the simple application below, uses HIP structs dim3 with the file name "test.hip.cpp"
-
-```cpp
-#include "hip/hip_runtime_api.h"
-#include "stdio.h"
-
-int main(int argc, char** argv) {
- dim3 grid1;
- printf("dim3 grid1; x=%d, y=%d, z=%d\n",grid1.x,grid1.y,grid1.z);
- dim3 grid2 = {1,1,1};
- printf("dim3 grid2 = {1,1,1}; x=%d, y=%d, z=%d\n",grid2.x,grid2.y,grid2.z);
- return 0;
-}
-```
-
-When using a C++ compiler,
-
-```shell
-$ gcc -x c++ $(hipconfig --cpp_config) test3.hip.cpp -o test
-$ ./test
-dim3 grid1; x=1, y=1, z=1
-dim3 grid2 = {1,1,1}; x=1, y=1, z=1
-```
-
-In which "dim3 grid1;" will yield a dim3 grid with all dimensional members x,y,z initialized to 1, as the default constructor behaves that way.
-Further, if written:
-
-```cpp
-dim3 grid(2); // yields {2,1,1}
-dim3 grid(2,3); yields {2,3,1}
-```
-
-In comparison, when using the C compiler,
-
-```shell
-$ gcc -x c $(hipconfig --cpp_config) test.hip.cpp -o test
-$ ./test
-dim3 grid1; x=646881376, y=21975, z=1517277280
-dim3 grid2 = {1,1,1}; x=1, y=1, z=1
-```
-
-In which "dim3 grid;" does not imply any initialization, no constructor is called, and dimensional values x,y,z of grid are undefined.
-NOTE: To get the C++ default behavior, C programmers must additionally specify the right-hand side as shown below,
-
-```cpp
-dim3 grid = {1,1,1}; // initialized as in C++
-```
-
-## Can I install both CUDA SDK and HIP-Clang on the same machine?
-
-Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA.
-
-## HIP detected my platform (HIP-Clang vs NVCC) incorrectly * what should I do?
-
-HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
-Sometimes this isn't what you want * you can force HIP to recognize the platform by setting the following,
-
-```shell
-export HIP_PLATFORM=amd
-```
-
-HIP then set and use correct AMD compiler and runtime,
-HIP_COMPILER=clang
-HIP_RUNTIME=rocclr
-
-To choose NVIDIA platform, you can set,
-
-```shell
-export HIP_PLATFORM=nvidia
-```
-
-In this case, HIP will set and use the following,
-
-```shell
-HIP_COMPILER=cuda
-HIP_RUNTIME=nvcc
-```
-
-One symptom of this problem is the message "error: 'unknown error'(11) at `square.hipref.cpp:56`. This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as NVCC. HIP may be able to compile the application using the NVCC tool-chain but will generate this error at runtime since the platform does not have a CUDA device.
-
-## On CUDA, can I mix CUDA code with HIP code?
-
-Yes. Most HIP data structures (`hipStream_t`, `hipEvent_t`) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids.
-One notable exception is that `hipError_t` is a new type, and cannot be used where a `cudaError_t` is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces:
-
-`hipErrorToCudaError`
-`hipCUDAErrorTohipError`
-`hipCUResultTohipError`
-
-If platform portability is important, use `#ifdef __HIP_PLATFORM_NVIDIA__` to guard the CUDA-specific code.
-
-## How do I trace HIP application flow?
-
-See {doc}`/how-to/logging` for more information.
-
-## What are the maximum limits of kernel launch parameters?
-
-Product of block.x, block.y, and block.z should be less than 1024.
-Please note, HIP does not support kernel launch with total work items defined in dimension with size `gridDim x blockDim >= 2^32`, so `gridDim.x * blockDim.x, gridDim.y * blockDim.y and gridDim.z * blockDim.z` are always less than 2^32.
-
-## Are ``__shfl_*_sync`` functions supported on HIP platform?
-
-``__shfl_*_sync`` is not supported on HIP but for NVCC path CUDA 9.0 and above all shuffle calls get redirected to it's sync version.
-
-## How to create a guard for code that is specific to the host or the GPU?
-
-The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU.
-
-## Why _OpenMP is undefined when compiling with `-fopenmp`?
-
-When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (for example `.cpp`) as an HIP translation unit leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language constructs you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU.
-
-## Does the HIP-Clang compiler support extern shared declarations?
-
-Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
-
-Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition:
-extern __shared__ type var[];
-
-## I have multiple HIP enabled devices and I am getting an error code `hipErrorSharedObjectInitFailed` with the message "Error: shared object initialization failed"?
-
-This error message is seen due to the fact that you do not have valid code object for all of your devices.
-
-If you have compiled the application yourself, make sure you have given the correct device name(s) and its features via: `--offload-arch`. If you are not mentioning the `--offload-arch`, make sure that `hipcc` is using the correct offload arch by verifying the hipcc output generated by setting the environment variable `HIPCC_VERBOSE=1`.
-
-If you have a precompiled application/library (like rocblas, TensorFlow etc) which gives you such error, there are one of two possibilities.
-
-* The application/library does not ship code object bundles for __all__ of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
-* The application/library does not ship code object bundles for __some__ of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on NVIDIA platform, to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
-
-Note: In previous releases, the error code is `hipErrorNoBinaryForGpu` with message "Unable to find code object for all current devices".
-The error code handling behavior is changed. HIP runtime shows the error code `hipErrorSharedObjectInitFailed` with message "Error: shared object initialization failed" on unsupported GPU.
-
-## How to use per-thread default stream in HIP?
-
-The per-thread default stream is an implicit stream local to both the thread and the current device. It does not do any implicit synchronization with other streams (like explicitly created streams), or default per-thread stream on other threads.
-
-The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
-
-In ROCm, a compilation option should be added in order to compile the translation unit with per-thread default stream enabled.
-`-fgpu-default-stream=per-thread`.
-Once source is compiled with per-thread default stream enabled, all APIs will be executed on per thread default stream, hence there will not be any implicit synchronization with other streams.
-
-Besides, per-thread default stream be enabled per translation unit, users can compile some files with feature enabled and some with feature disabled. Feature enabled translation unit will have default stream as per thread and there will not be any implicit synchronization done but other modules will have legacy default stream which will do implicit synchronization.
-
-## How to use complex multiplication and division operations?
-
-In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex data types,
-
-```c++
-typedef float2 hipFloatComplex;
-typedef double2 hipDoubleComplex;
-```
-
-Any application uses complex multiplication and division operations, need to replace '*' and '/' operators with the following,
-
-* `hipCmulf()` and `hipCdivf()` for `hipFloatComplex`
-* `hipCmul()` and `hipCdiv()` for `hipDoubleComplex`
-
-Note: These complex operations are equivalent to corresponding types/functions on the NVIDIA platform.
-
-## Can I develop applications with HIP APIs on Windows the same on Linux?
-
-Yes, HIP APIs are available to use on both Linux and Windows.
-Due to different working mechanisms on operating systems like Windows vs Linux, HIP APIs call corresponding lower level backend runtime libraries and kernel drivers for the OS, in order to control the executions on GPU hardware accordingly. There might be a few differences on the related backend software and driver support, which might affect usage of HIP APIs. See OS support details in HIP API document.
-
-## Does HIP support LUID?
-
-Starting ROCm 6.0, HIP runtime supports Locally Unique Identifier (LUID).
-This feature enables the local physical device(s) to interoperate with other devices. For example, DirectX 12.
-
-HIP runtime sets device LUID properties so the driver can query LUID to identify each device for interoperability.
-
-Note: HIP supports LUID only on Windows OS.
-
-## How can I know the version of HIP?
-
-HIP version definition has been updated since ROCm 4.2 release as the following:
-
-```cpp
-HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH
-```
-
-HIP version can be queried from HIP API call,
-
-```cpp
-hipRuntimeGetVersion(&runtimeVersion);
-```
-
-The version returned will always be greater than the versions in previous ROCm releases.
-
-Note: The version definition of HIP runtime is different from CUDA. On AMD platform, the function returns HIP runtime version, while on NVIDIA platform, it returns CUDA runtime version. And there is no mapping/correlation between HIP version and CUDA version.
diff --git a/docs/how-to/hip_porting_guide.md b/docs/how-to/hip_porting_guide.md
index b052dcda79..bc3a2deda9 100644
--- a/docs/how-to/hip_porting_guide.md
+++ b/docs/how-to/hip_porting_guide.md
@@ -111,10 +111,10 @@ Most CUDA libraries have a corresponding ROCm library with similar functionality
All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.
-* `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
-Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
-* `HIP_PLATFORM_NVDIA` is defined if the HIP platform targets NVIDIA.
-Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
+* `__HIP_PLATFORM_AMD__` is defined if the HIP platform targets AMD.
+Note, `__HIP_PLATFORM_HCC__` was previously defined if the HIP platform targeted AMD, it is deprecated.
+* `__HIP_PLATFORM_NVIDIA__` is defined if the HIP platform targets NVIDIA.
+Note, `__HIP_PLATFORM_NVCC__` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
### Identifying the Compiler: hip-clang or NVCC
@@ -257,7 +257,14 @@ ROCclr is a virtual device interface that HIP runtimes interact with different b
* NVIDIA platform
On NVIDIA platform, HIP is just a thin layer on top of CUDA.
-On non-AMD platform, HIP runtime determines if CUDA is available and can be used. If available, HIP_PLATFORM is set to `nvidia` and underneath CUDA path is used.
+
+The environment variable `HIP_PLATFORM` specifies the runtime to use. The
+platform is detected automatically by HIP. When an AMD graphics driver and an
+AMD GPU are detected, `HIP_PLATFORM` is set to `amd`. If both runtimes are
+installed, and a specific one should be used, or HIP can't detect the runtime,
+setting the environment variable manually tells `hipcc` what compilation path to
+choose. To use the CUDA compilation path, set the environment variable to
+`HIP_PLATFORM=nvidia`.
## `hipLaunchKernelGGL`
diff --git a/docs/how-to/hip_rtc.md b/docs/how-to/hip_rtc.md
deleted file mode 100644
index b96c069cb2..0000000000
--- a/docs/how-to/hip_rtc.md
+++ /dev/null
@@ -1,511 +0,0 @@
-# Programming for HIP runtime compiler (RTC)
-
-HIP lets you compile kernels at runtime with the `hiprtc*` APIs.
-Kernels can be stored as a text string and can be passed to HIPRTC APIs alongside options to guide the compilation.
-
-NOTE:
-
-* This library can be used on systems without HIP installed nor AMD GPU driver installed at all (offline compilation). Therefore, it does not depend on any HIP runtime library.
-* But it does depend on Code Object Manager (comgr). You may try to statically link comgr into HIPRTC to avoid any ambiguity.
-* Developers can decide to bundle this library with their application.
-
-## Compilation APIs
-
-To use HIPRTC functionality, HIPRTC header needs to be included first.
-`#include `
-
-Kernels can be stored in a string:
-
-```cpp
-static constexpr auto kernel_source {
-R"(
- extern "C"
- __global__ void vector_add(float* output, float* input1, float* input2, size_t size) {
- int i = threadIdx.x;
- if (i < size) {
- output[i] = input1[i] + input2[i];
- }
- }
-)"};
-```
-
-Now to compile this kernel, it needs to be associated with `hiprtcProgram` type, which is done by declaring `hiprtcProgram prog;` and associating the string of kernel with this program:
-
-```cpp
-hiprtcCreateProgram(&prog, // HIPRTC program handle
- kernel_source, // HIP kernel source string
- "vector_add.cpp", // Name of the HIP program, can be null or an empty string
- 0, // Number of headers
- NULL, // Header sources
- NULL); // Name of header files
-```
-
-`hiprtcCreateProgram` API also allows you to add headers which can be included in your RTC program.
-For online compilation, the compiler pre-defines HIP device API functions, HIP specific types and macros for device compilation, but does not include standard C/C++ headers by default. Users can only include header files provided to `hiprtcCreateProgram`.
-
-After associating the kernel string with `hiprtcProgram`, you can now compile this program using:
-
-```cpp
-hiprtcCompileProgram(prog, // hiprtcProgram
- 0, // Number of options
- options); // Clang Options [Supported Clang Options](clang_options.md)
-```
-
-`hiprtcCompileProgram` returns a status value which can be converted to string via `hiprtcGetErrorString`. If compilation is successful, `hiprtcCompileProgram` will return `HIPRTC_SUCCESS`.
-
-If the compilation fails, you can look up the logs via:
-
-```cpp
-size_t logSize;
-hiprtcGetProgramLogSize(prog, &logSize);
-
-if (logSize) {
- string log(logSize, '\0');
- hiprtcGetProgramLog(prog, &log[0]);
- // Corrective action with logs
-}
-```
-
-If the compilation is successful, you can load the compiled binary in a local variable.
-
-```cpp
-size_t codeSize;
-hiprtcGetCodeSize(prog, &codeSize);
-
-vector kernel_binary(codeSize);
-hiprtcGetCode(prog, kernel_binary.data());
-```
-
-After loading the binary, `hiprtcProgram` can be destroyed.
-`hiprtcDestroyProgram(&prog);`
-
-The binary present in `kernel_binary` can now be loaded via `hipModuleLoadData` API.
-
-```cpp
-hipModule_t module;
-hipFunction_t kernel;
-
-hipModuleLoadData(&module, kernel_binary.data());
-hipModuleGetFunction(&kernel, module, "vector_add");
-```
-
-And now this kernel can be launched via `hipModule` APIs.
-
-The full example is below:
-
-```cpp
-#include
-#include
-
-#include
-#include
-#include
-
-#define CHECK_RET_CODE(call, ret_code) \
- { \
- if ((call) != ret_code) { \
- std::cout << "Failed in call: " << #call << std::endl; \
- std::abort(); \
- } \
- }
-#define HIP_CHECK(call) CHECK_RET_CODE(call, hipSuccess)
-#define HIPRTC_CHECK(call) CHECK_RET_CODE(call, HIPRTC_SUCCESS)
-
-// source code for hiprtc
-static constexpr auto kernel_source{
- R"(
- extern "C"
- __global__ void vector_add(float* output, float* input1, float* input2, size_t size) {
- int i = threadIdx.x;
- if (i < size) {
- output[i] = input1[i] + input2[i];
- }
- }
-)"};
-
-int main() {
- hiprtcProgram prog;
- auto rtc_ret_code = hiprtcCreateProgram(&prog, // HIPRTC program handle
- kernel_source, // kernel source string
- "vector_add.cpp", // Name of the file
- 0, // Number of headers
- NULL, // Header sources
- NULL); // Name of header file
-
- if (rtc_ret_code != HIPRTC_SUCCESS) {
- std::cout << "Failed to create program" << std::endl;
- std::abort();
- }
-
- hipDeviceProp_t props;
- int device = 0;
- HIP_CHECK(hipGetDeviceProperties(&props, device));
- std::string sarg = std::string("--gpu-architecture=") +
- props.gcnArchName; // device for which binary is to be generated
-
- const char* options[] = {sarg.c_str()};
-
- rtc_ret_code = hiprtcCompileProgram(prog, // hiprtcProgram
- 0, // Number of options
- options); // Clang Options
- if (rtc_ret_code != HIPRTC_SUCCESS) {
- std::cout << "Failed to create program" << std::endl;
- std::abort();
- }
-
- size_t logSize;
- HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize));
-
- if (logSize) {
- std::string log(logSize, '\0');
- HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0]));
- std::cout << "Compilation failed with: " << log << std::endl;
- std::abort();
- }
-
- size_t codeSize;
- HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize));
-
- std::vector kernel_binary(codeSize);
- HIPRTC_CHECK(hiprtcGetCode(prog, kernel_binary.data()));
-
- HIPRTC_CHECK(hiprtcDestroyProgram(&prog));
-
- hipModule_t module;
- hipFunction_t kernel;
-
- HIP_CHECK(hipModuleLoadData(&module, kernel_binary.data()));
- HIP_CHECK(hipModuleGetFunction(&kernel, module, "vector_add"));
-
- constexpr size_t ele_size = 256; // total number of items to add
- std::vector hinput, output;
- hinput.reserve(ele_size);
- output.reserve(ele_size);
- for (size_t i = 0; i < ele_size; i++) {
- hinput.push_back(static_cast(i + 1));
- output.push_back(0.0f);
- }
-
- float *dinput1, *dinput2, *doutput;
- HIP_CHECK(hipMalloc(&dinput1, sizeof(float) * ele_size));
- HIP_CHECK(hipMalloc(&dinput2, sizeof(float) * ele_size));
- HIP_CHECK(hipMalloc(&doutput, sizeof(float) * ele_size));
-
- HIP_CHECK(hipMemcpy(dinput1, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice));
- HIP_CHECK(hipMemcpy(dinput2, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice));
-
- struct {
- float* output;
- float* input1;
- float* input2;
- size_t size;
- } args{doutput, dinput1, dinput2, ele_size};
-
- auto size = sizeof(args);
- void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
- HIP_LAUNCH_PARAM_END};
-
- HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, ele_size, 1, 1, 0, nullptr, nullptr, config));
-
- HIP_CHECK(hipMemcpy(output.data(), doutput, sizeof(float) * ele_size, hipMemcpyDeviceToHost));
-
- for (size_t i = 0; i < ele_size; i++) {
- if ((hinput[i] + hinput[i]) != output[i]) {
- std::cout << "Failed in validation: " << (hinput[i] + hinput[i]) << " - " << output[i]
- << std::endl;
- std::abort();
- }
- }
- std::cout << "Passed" << std::endl;
-
- HIP_CHECK(hipFree(dinput1));
- HIP_CHECK(hipFree(dinput2));
- HIP_CHECK(hipFree(doutput));
-}
-```
-
-## HIPRTC specific options
-
-HIPRTC provides a few HIPRTC specific flags
-
-* `--gpu-architecture` : This flag can guide the code object generation for a specific gpu arch. Example: `--gpu-architecture=gfx906:sramecc+:xnack-`, its equivalent to `--offload-arch`.
- * This option is compulsory if compilation is done on a system without AMD GPUs supported by HIP runtime.
- * Otherwise, HIPRTC will load the hip runtime and gather the current device and its architecture info and use it as option.
-* `-fgpu-rdc` : This flag when provided during the `hiprtcCompileProgram` generates the bitcode (HIPRTC doesn't convert this bitcode into ISA and binary). This bitcode can later be fetched using `hiprtcGetBitcode` and `hiprtcGetBitcodeSize` APIs.
-
-### Bitcode
-
-In the usual scenario, the kernel associated with `hiprtcProgram` is compiled into the binary which can be loaded and run. However, if `-fpu-rdc` option is provided in the compile options, HIPRTC calls comgr and generates only the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final binary.
-
-```cpp
-std::string sarg = std::string("-fgpu-rdc");
-const char* options[] = {
- sarg.c_str() };
-hiprtcCompileProgram(prog, // hiprtcProgram
- 1, // Number of options
- options);
-```
-
-If the compilation is successful, one can load the bitcode in a local variable using the bitcode APIs provided by HIPRTC.
-
-```cpp
-size_t bitCodeSize;
-hiprtcGetBitcodeSize(prog, &bitCodeSize);
-
-vector kernel_bitcode(bitCodeSize);
-hiprtcGetBitcode(prog, kernel_bitcode.data());
-```
-
-### CU Mode vs WGP mode
-
-AMD GPUs consist of an array of workgroup processors, each built with 2 compute units (CUs) capable of executing SIMD32. All the CUs inside a workgroup processor use local data share (LDS).
-
-gfx10+ support execution of wavefront in CU mode and work-group processor mode (WGP). Please refer to section 2.3 of [RDNA3 ISA reference](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf).
-
-gfx9 and below only supports CU mode.
-
-In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup processor, where as in CU mode only 2 warps of a block can simultaneously execute on a CU. In theory, WGP mode might help with occupancy and increase the performance of certain HIP programs (if not bound to inter warp communication), but might incur performance penalty on other HIP programs which rely on atomics and inter warp communication. This also has effect of how the LDS is split between warps, please refer to [RDNA3 ISA reference](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf) for more information.
-
-HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by passing `-mcumode` to HIPRTC compile options in `hiprtcCompileProgram`.
-
-## Linker APIs
-
-The bitcode generated using the HIPRTC Bitcode APIs can be loaded using `hipModule` APIs and also can be linked with other generated bitcodes with appropriate linker flags using the HIPRTC linker APIs. This also provides more flexibility and optimizations to the applications who want to generate the binary dynamically according to their needs. The input bitcodes can be generated only for a specific architecture or it can be a bundled bitcode which is generated for multiple architectures.
-
-### Example
-
-Firstly, HIPRTC link instance or a pending linker invocation must be created using `hiprtcLinkCreate`, with the appropriate linker options provided.
-
-```cpp
-hiprtcLinkCreate( num_options, // number of options
- options, // Array of options
- option_vals, // Array of option values cast to void*
- &rtc_link_state ); // HIPRTC link state created upon success
-```
-
-Following which, the bitcode data can be added to this link instance via `hiprtcLinkAddData` (if the data is present as a string) or `hiprtcLinkAddFile` (if the data is present as a file) with the appropriate input type according to the data or the bitcode used.
-
-```cpp
-hiprtcLinkAddData(rtc_link_state, // HIPRTC link state
- input_type, // type of the input data or bitcode
- bit_code_ptr, // input data which is null terminated
- bit_code_size, // size of the input data
- "a", // optional name for this input
- 0, // size of the options
- 0, // Array of options applied to this input
- 0); // Array of option values cast to void*
-```
-
-```cpp
-hiprtcLinkAddFile(rtc_link_state, // HIPRTC link state
- input_type, // type of the input data or bitcode
- bc_file_path.c_str(), // path to the input file where bitcode is present
- 0, // size of the options
- 0, // Array of options applied to this input
- 0); // Array of option values cast to void*
-```
-
-Once the bitcodes for multiple architectures are added to the link instance, the linking of the device code must be completed using `hiprtcLinkComplete` which generates the final binary.
-
-```cpp
-hiprtcLinkComplete(rtc_link_state, // HIPRTC link state
- &binary, // upon success, points to the output binary
- &binarySize); // size of the binary is stored (optional)
-```
-
-If the `hiprtcLinkComplete` returns successfully, the generated binary can be loaded and run using the `hipModule*` APIs.
-
-```cpp
-hipModuleLoadData(&module, binary);
-```
-
-#### Note
-
-* The compiled binary must be loaded before HIPRTC link instance is destroyed using the `hiprtcLinkDestroy` API.
-
-```cpp
-hiprtcLinkDestroy(rtc_link_state);
-```
-
-* The correct sequence of calls is : `hiprtcLinkCreate`, `hiprtcLinkAddData` or `hiprtcLinkAddFile`, `hiprtcLinkComplete`, `hiprtcModuleLoadData`, `hiprtcLinkDestroy`.
-
-### Input Types
-
-HIPRTC provides `hiprtcJITInputType` enumeration type which defines the input types accepted by the Linker APIs. Here are the `enum` values of `hiprtcJITInputType`. However only the input types `HIPRTC_JIT_INPUT_LLVM_BITCODE`, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are supported currently.
-
-`HIPRTC_JIT_INPUT_LLVM_BITCODE` can be used to load both LLVM bitcode or LLVM IR assembly code. However, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are only for bundled bitcode and archive of bundled bitcode.
-
-```cpp
-HIPRTC_JIT_INPUT_CUBIN = 0,
-HIPRTC_JIT_INPUT_PTX,
-HIPRTC_JIT_INPUT_FATBINARY,
-HIPRTC_JIT_INPUT_OBJECT,
-HIPRTC_JIT_INPUT_LIBRARY,
-HIPRTC_JIT_INPUT_NVVM,
-HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES,
-HIPRTC_JIT_INPUT_LLVM_BITCODE = 100,
-HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101,
-HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102,
-HIPRTC_JIT_NUM_INPUT_TYPES = (HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + 3)
-```
-
-### Backward Compatibility of LLVM Bitcode/IR
-
-For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility is assured only when the ROCm or HIP SDK version used for generating the LLVM bitcode/IR matches the version used during the runtime compilation. When an application requires the ingestion of bitcode/IR not derived from the currently installed AMD compiler, it must run with HIPRTC and comgr dynamic libraries that are compatible with the version of the bitcode/IR.
-
-comgr, a shared library, incorporates the LLVM/Clang compiler that HIPRTC relies on. To identify the bitcode/IR version that comgr is compatible with, one can execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. For instance, if compiling bitcode/IR version 14, the HIPRTC and comgr libraries released by AMD around mid 2022 would be the best choice, assuming the LLVM/Clang version included in the package is also version 14.
-
-To ensure smooth operation and compatibility, an application may choose to ship the specific versions of HIPRTC and comgr dynamic libraries, or it may opt to clearly specify the version requirements and dependencies. This approach guarantees that the application can correctly compile the specified version of bitcode/IR.
-
-### Link Options
-
-* `HIPRTC_JIT_IR_TO_ISA_OPT_EXT` - AMD Only. Options to be passed on to link step of compiler by `hiprtcLinkCreate`.
-* `HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT` - AMD Only. Count of options passed on to link step of compiler.
-
-Example:
-
-```cpp
-const char* isaopts[] = {"-mllvm", "-inline-threshold=1", "-mllvm", "-inlinehint-threshold=1"};
-std::vector jit_options = {HIPRTC_JIT_IR_TO_ISA_OPT_EXT,
- HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
-size_t isaoptssize = 4;
-const void* lopts[] = {(void*)isaopts, (void*)(isaoptssize)};
-hiprtcLinkState linkstate;
-hiprtcLinkCreate(2, jit_options.data(), (void**)lopts, &linkstate);
-```
-
-## Error Handling
-
-HIPRTC defines the `hiprtcResult` enumeration type and a function `hiprtcGetErrorString` for API call error handling. `hiprtcResult` `enum` defines the API result codes. HIPRTC APIs return `hiprtcResult` to indicate the call result. `hiprtcGetErrorString` function returns a string describing the given `hiprtcResult` code, e.g., HIPRTC_SUCCESS to "HIPRTC_SUCCESS". For unrecognized enumeration values, it returns "Invalid HIPRTC error code".
-
-`hiprtcResult` `enum` supported values and the `hiprtcGetErrorString` usage are mentioned below.
-
-```cpp
-HIPRTC_SUCCESS = 0,
-HIPRTC_ERROR_OUT_OF_MEMORY = 1,
-HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
-HIPRTC_ERROR_INVALID_INPUT = 3,
-HIPRTC_ERROR_INVALID_PROGRAM = 4,
-HIPRTC_ERROR_INVALID_OPTION = 5,
-HIPRTC_ERROR_COMPILATION = 6,
-HIPRTC_ERROR_LINKING = 7,
-HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 8,
-HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 9,
-HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 10,
-HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 11,
-HIPRTC_ERROR_INTERNAL_ERROR = 12
-```
-
-```cpp
-hiprtcResult result;
-result = hiprtcCompileProgram(prog, 1, opts);
-if (result != HIPRTC_SUCCESS) {
-std::cout << "hiprtcCompileProgram fails with error " << hiprtcGetErrorString(result);
-}
-```
-
-## HIPRTC General APIs
-
-HIPRTC provides the following API for querying the version.
-
-`hiprtcVersion(int* major, int* minor)` - This sets the output parameters major and minor with the HIP Runtime compilation major version and minor version number respectively.
-
-Currently, it returns hardcoded value. This should be implemented to return HIP runtime major and minor version in the future releases.
-
-## Lowered Names (Mangled Names)
-
-HIPRTC mangles the `__global__` function names and names of `__device__` and `__constant__` variables. If the generated binary is being loaded using the HIP Runtime API, the kernel function or `__device__/__constant__` variable must be looked up by name, but this is very hard when the name has been mangled. To overcome this, HIPRTC provides API functions that map `__global__` function or `__device__/__constant__` variable names in the source to the mangled names present in the generated binary.
-
-The two APIs `hiprtcAddNameExpression` and `hiprtcGetLoweredName` provide this functionality. First, a 'name expression' string denoting the address for the `__global__` function or `__device__/__constant__` variable is provided to `hiprtcAddNameExpression`. Then, the program is compiled with `hiprtcCompileProgram`. During compilation, HIPRTC will parse the name expression string as a C++ constant expression at the end of the user program. Finally, the function `hiprtcGetLoweredName` is called with the original name expression and it returns a pointer to the lowered name. The lowered name can be used to refer to the kernel or variable in the HIP Runtime API.
-
-### Note
-
-* The identical name expression string must be provided on a subsequent call to `hiprtcGetLoweredName` to extract the lowered name.
-* The correct sequence of calls is : `hiprtcAddNameExpression`, `hiprtcCompileProgram`, `hiprtcGetLoweredName`, `hiprtcDestroyProgram`.
-* The lowered names must be fetched using `hiprtcGetLoweredName` only after the HIPRTC program has been compiled, and before it has been destroyed.
-
-### Example
-
-kernel containing various definitions `__global__` functions/function templates and `__device__/__constant__` variables can be stored in a string.
-
-```cpp
-static constexpr const char gpu_program[] {
-R"(
-__device__ int V1; // set from host code
-static __global__ void f1(int *result) { *result = V1 + 10; }
-namespace N1 {
-namespace N2 {
-__constant__ int V2; // set from host code
-__global__ void f2(int *result) { *result = V2 + 20; }
-}
-}
-template
-__global__ void f3(int *result) { *result = sizeof(T); }
-)"};
-```
-
-`hiprtcAddNameExpression` is called with various name expressions referring to the address of `__global__` functions and `__device__/__constant__` variables.
-
-```cpp
-kernel_name_vec.push_back("&f1");
-kernel_name_vec.push_back("N1::N2::f2");
-kernel_name_vec.push_back("f3");
-for (auto&& x : kernel_name_vec) hiprtcAddNameExpression(prog, x.c_str());
-variable_name_vec.push_back("&V1");
-variable_name_vec.push_back("&N1::N2::V2");
-for (auto&& x : variable_name_vec) hiprtcAddNameExpression(prog, x.c_str());
-```
-
-After which, the program is compiled using `hiprtcCompileProgram` and the generated binary is loaded using `hipModuleLoadData`. And the mangled names can be fetched using `hirtcGetLoweredName`.
-
-```cpp
-for (decltype(variable_name_vec.size()) i = 0; i != variable_name_vec.size(); ++i) {
- const char* name;
- hiprtcGetLoweredName(prog, variable_name_vec[i].c_str(), &name);
-}
-```
-
-```cpp
-for (decltype(kernel_name_vec.size()) i = 0; i != kernel_name_vec.size(); ++i) {
- const char* name;
- hiprtcGetLoweredName(prog, kernel_name_vec[i].c_str(), &name);
-}
-```
-
-The mangled name of the variables are used to look up the variable in the module and update its value.
-
-```cpp
-hipDeviceptr_t variable_addr;
-size_t bytes{};
-hipModuleGetGlobal(&variable_addr, &bytes, module, name);
-hipMemcpyHtoD(variable_addr, &initial_value, sizeof(initial_value));
-```
-
-Finally, the mangled name of the kernel is used to launch it using the `hipModule` APIs.
-
-```cpp
-hipFunction_t kernel;
-hipModuleGetFunction(&kernel, module, name);
-hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, config);
-```
-
-Please have a look at `hiprtcGetLoweredName.cpp` for the detailed example.
-
-## Versioning
-
-HIPRTC follows the below versioning.
-
-* Linux
- * HIPRTC follows the same versioning as HIP runtime library.
- * The `so` name field for the shared library is set to MAJOR version. For example, for HIP 5.3 the `so` name is set to 5 (`hiprtc.so.5`).
-* Windows
- * HIPRTC dll is named as `hiprtcXXYY.dll` where XX is MAJOR version and YY is MINOR version. For example, for HIP 5.3 the name is `hiprtc0503.dll`.
-
-## HIP header support
-
-* Added HIPRTC support for all the hip common header files such as library_types.h, hip_math_constants.h, hip_complex.h, math_functions.h, surface_types.h etc. from 6.1. HIPRTC users need not include any HIP macros or constants explicitly in their header files. All of these should get included via HIPRTC builtins when the app links to HIPRTC library.
-
-## Deprecation notice
-
-* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a separate library `libhiprtc.so`/`libhiprtc.dll`. But on Linux, HIPRTC symbols are also present in `libamdhip64.so` in order to support the existing applications. Gradually, these symbols will be removed from HIP library and applications using HIPRTC will be required to explicitly link to HIPRTC library. However, on Windows `hiprtc.dll` must be used as the `amdhip64.dll` doesn't contain the HIPRTC symbols.
-* Data types such as `uint32_t`, `uint64_t`, `int32_t`, `int64_t` defined in std namespace in HIPRTC are deprecated earlier and are being removed from ROCm release 6.1 since these can conflict with the standard C++ data types. These data types are now prefixed with `__hip__`, e.g. `__hip_uint32_t`. Applications previously using `std::uint32_t` or similar types can use `__hip_` prefixed types to avoid conflicts with standard std namespace or application can have their own definitions for these types. Also, type_traits templates previously defined in std namespace are moved to `__hip_internal` namespace as implementation details.
diff --git a/docs/how-to/hip_rtc.rst b/docs/how-to/hip_rtc.rst
new file mode 100644
index 0000000000..b37d21c491
--- /dev/null
+++ b/docs/how-to/hip_rtc.rst
@@ -0,0 +1,541 @@
+.. meta::
+ :description: HIP runtime compiler (RTC)
+ :keywords: AMD, ROCm, HIP, CUDA, RTC, HIP runtime compiler
+
+.. _hip_runtime_compiler_how-to:
+
+*******************************************************************************
+Programming for HIP runtime compiler (RTC)
+*******************************************************************************
+
+HIP lets you compile kernels at runtime with the ``hiprtc*`` APIs.
+Kernels can be stored as a text string and can be passed to HIPRTC APIs alongside options to guide the compilation.
+
+.. note::
+
+ * This library can be used on systems with neither HIP nor an AMD GPU driver installed (offline compilation). Therefore, it does not depend on any HIP runtime library.
+ * But it does depend on Code Object Manager (comgr). You may try to statically link comgr into HIPRTC to avoid any ambiguity.
+ * Developers can decide to bundle this library with their application.
+
+Compilation APIs
+===============================================================================
+
+To use HIPRTC functionality, HIPRTC header needs to be included first.
+``#include <hip/hiprtc.h>``
+
+Kernels can be stored in a string:
+
+.. code-block:: cpp
+
+ static constexpr auto kernel_source {
+ R"(
+ extern "C"
+ __global__ void vector_add(float* output, float* input1, float* input2, size_t size) {
+ int i = threadIdx.x;
+ if (i < size) {
+ output[i] = input1[i] + input2[i];
+ }
+ }
+ )"};
+
+Now to compile this kernel, it needs to be associated with ``hiprtcProgram`` type, which is done by declaring ``hiprtcProgram prog;`` and associating the string of kernel with this program:
+
+.. code-block:: cpp
+
+ hiprtcCreateProgram(&prog, // HIPRTC program handle
+ kernel_source, // HIP kernel source string
+ "vector_add.cpp", // Name of the HIP program, can be null or an empty string
+ 0, // Number of headers
+ NULL, // Header sources
+ NULL); // Name of header files
+
+``hiprtcCreateProgram`` API also allows you to add headers which can be included in your RTC program.
+For online compilation, the compiler pre-defines HIP device API functions, HIP specific types and macros for device compilation, but does not include standard C/C++ headers by default. Users can only include header files provided to ``hiprtcCreateProgram``.
+
+After associating the kernel string with ``hiprtcProgram``, you can now compile this program using:
+
+.. code-block:: cpp
+
+ hiprtcCompileProgram(prog, // hiprtcProgram
+ 0, // Number of options
+ options); // Clang Options [Supported Clang Options](clang_options.md)
+
+``hiprtcCompileProgram`` returns a status value which can be converted to string via ``hiprtcGetErrorString``. If compilation is successful, ``hiprtcCompileProgram`` will return ``HIPRTC_SUCCESS``.
+
+If the compilation fails, you can look up the logs via:
+
+.. code-block:: cpp
+
+ size_t logSize;
+ hiprtcGetProgramLogSize(prog, &logSize);
+
+ if (logSize) {
+ string log(logSize, '\0');
+ hiprtcGetProgramLog(prog, &log[0]);
+ // Corrective action with logs
+ }
+
+If the compilation is successful, you can load the compiled binary in a local variable.
+
+.. code-block:: cpp
+
+ size_t codeSize;
+ hiprtcGetCodeSize(prog, &codeSize);
+
+ vector<char> kernel_binary(codeSize);
+ hiprtcGetCode(prog, kernel_binary.data());
+
+After loading the binary, ``hiprtcProgram`` can be destroyed.
+``hiprtcDestroyProgram(&prog);``
+
+The binary present in ``kernel_binary`` can now be loaded via ``hipModuleLoadData`` API.
+
+.. code-block:: cpp
+
+ hipModule_t module;
+ hipFunction_t kernel;
+
+ hipModuleLoadData(&module, kernel_binary.data());
+ hipModuleGetFunction(&kernel, module, "vector_add");
+
+And now this kernel can be launched via ``hipModule`` APIs.
+
+The full example is below:
+
+.. code-block:: cpp
+
+ #include <hip/hip_runtime.h>
+ #include <hip/hiprtc.h>
+
+ #include <iostream>
+ #include <string>
+ #include <vector>
+
+ #define CHECK_RET_CODE(call, ret_code) \
+ { \
+ if ((call) != ret_code) { \
+ std::cout << "Failed in call: " << #call << std::endl; \
+ std::abort(); \
+ } \
+ }
+ #define HIP_CHECK(call) CHECK_RET_CODE(call, hipSuccess)
+ #define HIPRTC_CHECK(call) CHECK_RET_CODE(call, HIPRTC_SUCCESS)
+
+ // source code for hiprtc
+ static constexpr auto kernel_source{
+ R"(
+ extern "C"
+ __global__ void vector_add(float* output, float* input1, float* input2, size_t size) {
+ int i = threadIdx.x;
+ if (i < size) {
+ output[i] = input1[i] + input2[i];
+ }
+ }
+ )"};
+
+ int main() {
+ hiprtcProgram prog;
+ auto rtc_ret_code = hiprtcCreateProgram(&prog, // HIPRTC program handle
+ kernel_source, // kernel source string
+ "vector_add.cpp", // Name of the file
+ 0, // Number of headers
+ NULL, // Header sources
+ NULL); // Name of header file
+
+ if (rtc_ret_code != HIPRTC_SUCCESS) {
+ std::cout << "Failed to create program" << std::endl;
+ std::abort();
+ }
+
+ hipDeviceProp_t props;
+ int device = 0;
+ HIP_CHECK(hipGetDeviceProperties(&props, device));
+ std::string sarg = std::string("--gpu-architecture=") +
+ props.gcnArchName; // device for which binary is to be generated
+
+ const char* options[] = {sarg.c_str()};
+
+ rtc_ret_code = hiprtcCompileProgram(prog, // hiprtcProgram
+ 0, // Number of options
+ options); // Clang Options
+ if (rtc_ret_code != HIPRTC_SUCCESS) {
+ std::cout << "Failed to create program" << std::endl;
+ std::abort();
+ }
+
+ size_t logSize;
+ HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize));
+
+ if (logSize) {
+ std::string log(logSize, '\0');
+ HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0]));
+ std::cout << "Compilation failed with: " << log << std::endl;
+ std::abort();
+ }
+
+ size_t codeSize;
+ HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize));
+
+ std::vector<char> kernel_binary(codeSize);
+ HIPRTC_CHECK(hiprtcGetCode(prog, kernel_binary.data()));
+
+ HIPRTC_CHECK(hiprtcDestroyProgram(&prog));
+
+ hipModule_t module;
+ hipFunction_t kernel;
+
+ HIP_CHECK(hipModuleLoadData(&module, kernel_binary.data()));
+ HIP_CHECK(hipModuleGetFunction(&kernel, module, "vector_add"));
+
+ constexpr size_t ele_size = 256; // total number of items to add
+ std::vector<float> hinput, output;
+ hinput.reserve(ele_size);
+ output.reserve(ele_size);
+ for (size_t i = 0; i < ele_size; i++) {
+ hinput.push_back(static_cast<float>(i + 1));
+ output.push_back(0.0f);
+ }
+
+ float *dinput1, *dinput2, *doutput;
+ HIP_CHECK(hipMalloc(&dinput1, sizeof(float) * ele_size));
+ HIP_CHECK(hipMalloc(&dinput2, sizeof(float) * ele_size));
+ HIP_CHECK(hipMalloc(&doutput, sizeof(float) * ele_size));
+
+ HIP_CHECK(hipMemcpy(dinput1, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(dinput2, hinput.data(), sizeof(float) * ele_size, hipMemcpyHostToDevice));
+
+ struct {
+ float* output;
+ float* input1;
+ float* input2;
+ size_t size;
+ } args{doutput, dinput1, dinput2, ele_size};
+
+ auto size = sizeof(args);
+ void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+ HIP_LAUNCH_PARAM_END};
+
+ HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, ele_size, 1, 1, 0, nullptr, nullptr, config));
+
+ HIP_CHECK(hipMemcpy(output.data(), doutput, sizeof(float) * ele_size, hipMemcpyDeviceToHost));
+
+ for (size_t i = 0; i < ele_size; i++) {
+ if ((hinput[i] + hinput[i]) != output[i]) {
+ std::cout << "Failed in validation: " << (hinput[i] + hinput[i]) << " - " << output[i]
+ << std::endl;
+ std::abort();
+ }
+ }
+ std::cout << "Passed" << std::endl;
+
+ HIP_CHECK(hipFree(dinput1));
+ HIP_CHECK(hipFree(dinput2));
+ HIP_CHECK(hipFree(doutput));
+ }
+
+HIPRTC specific options
+===============================================================================
+
+HIPRTC provides a few HIPRTC specific flags:
+
+* ``--gpu-architecture`` : This flag can guide the code object generation for a specific GPU architecture. Example: ``--gpu-architecture=gfx906:sramecc+:xnack-``. It is equivalent to ``--offload-arch``.
+
+ * This option is compulsory if compilation is done on a system without AMD GPUs supported by HIP runtime.
+ * Otherwise, HIPRTC will load the hip runtime and gather the current device and its architecture info and use it as option.
+
+* ``-fgpu-rdc`` : This flag when provided during the ``hiprtcCompileProgram`` generates the bitcode (HIPRTC doesn't convert this bitcode into ISA and binary). This bitcode can later be fetched using ``hiprtcGetBitcode`` and ``hiprtcGetBitcodeSize`` APIs.
+
+Bitcode
+-------------------------------------------------------------------------------
+
+In the usual scenario, the kernel associated with ``hiprtcProgram`` is compiled into the binary which can be loaded and run. However, if the ``-fgpu-rdc`` option is provided in the compile options, HIPRTC calls comgr and generates only the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final binary.
+
+.. code-block:: cpp
+
+ std::string sarg = std::string("-fgpu-rdc");
+ const char* options[] = {
+ sarg.c_str() };
+ hiprtcCompileProgram(prog, // hiprtcProgram
+ 1, // Number of options
+ options);
+
+If the compilation is successful, one can load the bitcode in a local variable using the bitcode APIs provided by HIPRTC.
+
+.. code-block:: cpp
+
+ size_t bitCodeSize;
+ hiprtcGetBitcodeSize(prog, &bitCodeSize);
+
+ vector kernel_bitcode(bitCodeSize);
+ hiprtcGetBitcode(prog, kernel_bitcode.data());
+
+CU Mode vs WGP mode
+-------------------------------------------------------------------------------
+
+AMD GPUs consist of an array of workgroup processors, each built with 2 compute units (CUs) capable of executing SIMD32. All the CUs inside a workgroup processor use local data share (LDS).
+
+gfx10+ support execution of wavefront in CU mode and work-group processor mode (WGP). Please refer to section 2.3 of `RDNA3 ISA reference `_.
+
+gfx9 and below only supports CU mode.
+
+In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup processor, whereas in CU mode only 2 warps of a block can simultaneously execute on a CU. In theory, WGP mode might help with occupancy and increase the performance of certain HIP programs (if not bound to inter warp communication), but might incur performance penalty on other HIP programs which rely on atomics and inter warp communication. This also affects how the LDS is split between warps, please refer to `RDNA3 ISA reference `_ for more information.
+
+HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by passing ``-mcumode`` to HIPRTC compile options in ``hiprtcCompileProgram``.
+
+Linker APIs
+===============================================================================
+
+The bitcode generated using the HIPRTC Bitcode APIs can be loaded using ``hipModule`` APIs and can also be linked with other generated bitcodes with appropriate linker flags using the HIPRTC linker APIs. This also provides more flexibility and optimizations to applications that want to generate the binary dynamically according to their needs. The input bitcodes can be generated only for a specific architecture or it can be a bundled bitcode which is generated for multiple architectures.
+
+Example
+-------------------------------------------------------------------------------
+
+Firstly, HIPRTC link instance or a pending linker invocation must be created using ``hiprtcLinkCreate``, with the appropriate linker options provided.
+
+.. code-block:: cpp
+
+ hiprtcLinkCreate( num_options, // number of options
+ options, // Array of options
+ option_vals, // Array of option values cast to void*
+ &rtc_link_state ); // HIPRTC link state created upon success
+
+Following which, the bitcode data can be added to this link instance via ``hiprtcLinkAddData`` (if the data is present as a string) or ``hiprtcLinkAddFile`` (if the data is present as a file) with the appropriate input type according to the data or the bitcode used.
+
+.. code-block:: cpp
+
+ hiprtcLinkAddData(rtc_link_state, // HIPRTC link state
+ input_type, // type of the input data or bitcode
+ bit_code_ptr, // input data which is null terminated
+ bit_code_size, // size of the input data
+ "a", // optional name for this input
+ 0, // size of the options
+ 0, // Array of options applied to this input
+ 0); // Array of option values cast to void*
+
+.. code-block:: cpp
+
+ hiprtcLinkAddFile(rtc_link_state, // HIPRTC link state
+ input_type, // type of the input data or bitcode
+ bc_file_path.c_str(), // path to the input file where bitcode is present
+ 0, // size of the options
+ 0, // Array of options applied to this input
+ 0); // Array of option values cast to void*
+
+Once the bitcodes for multiple architectures are added to the link instance, the linking of the device code must be completed using ``hiprtcLinkComplete`` which generates the final binary.
+
+.. code-block:: cpp
+
+ hiprtcLinkComplete(rtc_link_state, // HIPRTC link state
+ &binary, // upon success, points to the output binary
+ &binarySize); // size of the binary is stored (optional)
+
+If the ``hiprtcLinkComplete`` returns successfully, the generated binary can be loaded and run using the ``hipModule*`` APIs.
+
+.. code-block:: cpp
+
+ hipModuleLoadData(&module, binary);
+
+.. note::
+
+ * The compiled binary must be loaded before HIPRTC link instance is destroyed using the ``hiprtcLinkDestroy`` API.
+
+ .. code-block:: cpp
+
+ hiprtcLinkDestroy(rtc_link_state);
+
+ * The correct sequence of calls is : ``hiprtcLinkCreate``, ``hiprtcLinkAddData`` or ``hiprtcLinkAddFile``, ``hiprtcLinkComplete``, ``hipModuleLoadData``, ``hiprtcLinkDestroy``.
+
+Input Types
+-------------------------------------------------------------------------------
+
+HIPRTC provides ``hiprtcJITInputType`` enumeration type which defines the input types accepted by the Linker APIs. Here are the ``enum`` values of ``hiprtcJITInputType``. However only the input types ``HIPRTC_JIT_INPUT_LLVM_BITCODE``, ``HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE`` and ``HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE`` are supported currently.
+
+``HIPRTC_JIT_INPUT_LLVM_BITCODE`` can be used to load both LLVM bitcode or LLVM IR assembly code. However, ``HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE`` and ``HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE`` are only for bundled bitcode and archive of bundled bitcode.
+
+.. code-block:: cpp
+
+ HIPRTC_JIT_INPUT_CUBIN = 0,
+ HIPRTC_JIT_INPUT_PTX,
+ HIPRTC_JIT_INPUT_FATBINARY,
+ HIPRTC_JIT_INPUT_OBJECT,
+ HIPRTC_JIT_INPUT_LIBRARY,
+ HIPRTC_JIT_INPUT_NVVM,
+ HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES,
+ HIPRTC_JIT_INPUT_LLVM_BITCODE = 100,
+ HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101,
+ HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102,
+ HIPRTC_JIT_NUM_INPUT_TYPES = (HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + 3)
+
+Backward Compatibility of LLVM Bitcode/IR
+-------------------------------------------------------------------------------
+
+For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility is assured only when the ROCm or HIP SDK version used for generating the LLVM bitcode/IR matches the version used during the runtime compilation. When an application requires the ingestion of bitcode/IR not derived from the currently installed AMD compiler, it must run with HIPRTC and comgr dynamic libraries that are compatible with the version of the bitcode/IR.
+
+comgr, a shared library, incorporates the LLVM/Clang compiler that HIPRTC relies on. To identify the bitcode/IR version that comgr is compatible with, one can execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. For instance, if compiling bitcode/IR version 14, the HIPRTC and comgr libraries released by AMD around mid 2022 would be the best choice, assuming the LLVM/Clang version included in the package is also version 14.
+
+To ensure smooth operation and compatibility, an application may choose to ship the specific versions of HIPRTC and comgr dynamic libraries, or it may opt to clearly specify the version requirements and dependencies. This approach guarantees that the application can correctly compile the specified version of bitcode/IR.
+
+Link Options
+-------------------------------------------------------------------------------
+
+* ``HIPRTC_JIT_IR_TO_ISA_OPT_EXT`` - AMD Only. Options to be passed on to link step of compiler by ``hiprtcLinkCreate``.
+* ``HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT`` - AMD Only. Count of options passed on to link step of compiler.
+
+Example:
+
+.. code-block:: cpp
+
+ const char* isaopts[] = {"-mllvm", "-inline-threshold=1", "-mllvm", "-inlinehint-threshold=1"};
+ std::vector jit_options = {HIPRTC_JIT_IR_TO_ISA_OPT_EXT,
+ HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
+ size_t isaoptssize = 4;
+ const void* lopts[] = {(void*)isaopts, (void*)(isaoptssize)};
+ hiprtcLinkState linkstate;
+ hiprtcLinkCreate(2, jit_options.data(), (void**)lopts, &linkstate);
+
+Error Handling
+===============================================================================
+
+HIPRTC defines the ``hiprtcResult`` enumeration type and a function ``hiprtcGetErrorString`` for API call error handling. ``hiprtcResult`` ``enum`` defines the API result codes. HIPRTC APIs return ``hiprtcResult`` to indicate the call result. ``hiprtcGetErrorString`` function returns a string describing the given ``hiprtcResult`` code, e.g., HIPRTC_SUCCESS to "HIPRTC_SUCCESS". For unrecognized enumeration values, it returns "Invalid HIPRTC error code".
+
+``hiprtcResult`` ``enum`` supported values and the ``hiprtcGetErrorString`` usage are mentioned below.
+
+.. code-block:: cpp
+
+ HIPRTC_SUCCESS = 0,
+ HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+ HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+ HIPRTC_ERROR_INVALID_INPUT = 3,
+ HIPRTC_ERROR_INVALID_PROGRAM = 4,
+ HIPRTC_ERROR_INVALID_OPTION = 5,
+ HIPRTC_ERROR_COMPILATION = 6,
+ HIPRTC_ERROR_LINKING = 7,
+ HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 8,
+ HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 9,
+ HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 10,
+ HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 11,
+ HIPRTC_ERROR_INTERNAL_ERROR = 12
+
+.. code-block:: cpp
+
+ hiprtcResult result;
+ result = hiprtcCompileProgram(prog, 1, opts);
+ if (result != HIPRTC_SUCCESS) {
+ std::cout << "hiprtcCompileProgram fails with error " << hiprtcGetErrorString(result);
+ }
+
+HIPRTC General APIs
+===============================================================================
+
+HIPRTC provides the following API for querying the version.
+
+``hiprtcVersion(int* major, int* minor)`` - This sets the output parameters major and minor with the HIP Runtime compilation major version and minor version number respectively.
+
+Currently, it returns a hardcoded value. This should be implemented to return the HIP runtime major and minor version in future releases.
+
+Lowered Names (Mangled Names)
+===============================================================================
+
+HIPRTC mangles the ``__global__`` function names and names of ``__device__`` and ``__constant__`` variables. If the generated binary is being loaded using the HIP Runtime API, the kernel function or ``__device__/__constant__`` variable must be looked up by name, but this is very hard when the name has been mangled. To overcome this, HIPRTC provides API functions that map ``__global__`` function or ``__device__/__constant__`` variable names in the source to the mangled names present in the generated binary.
+
+The two APIs ``hiprtcAddNameExpression`` and ``hiprtcGetLoweredName`` provide this functionality. First, a 'name expression' string denoting the address for the ``__global__`` function or ``__device__/__constant__`` variable is provided to ``hiprtcAddNameExpression``. Then, the program is compiled with ``hiprtcCompileProgram``. During compilation, HIPRTC will parse the name expression string as a C++ constant expression at the end of the user program. Finally, the function ``hiprtcGetLoweredName`` is called with the original name expression and it returns a pointer to the lowered name. The lowered name can be used to refer to the kernel or variable in the HIP Runtime API.
+
+.. note::
+
+ * The identical name expression string must be provided on a subsequent call to ``hiprtcGetLoweredName`` to extract the lowered name.
+ * The correct sequence of calls is : ``hiprtcAddNameExpression``, ``hiprtcCompileProgram``, ``hiprtcGetLoweredName``, ``hiprtcDestroyProgram``.
+ * The lowered names must be fetched using ``hiprtcGetLoweredName`` only after the HIPRTC program has been compiled, and before it has been destroyed.
+
+Example
+-------------------------------------------------------------------------------
+
+A kernel containing various definitions of ``__global__`` functions/function templates and ``__device__/__constant__`` variables can be stored in a string.
+
+.. code-block:: cpp
+
+ static constexpr const char gpu_program[] {
+ R"(
+ __device__ int V1; // set from host code
+ static __global__ void f1(int *result) { *result = V1 + 10; }
+ namespace N1 {
+ namespace N2 {
+ __constant__ int V2; // set from host code
+ __global__ void f2(int *result) { *result = V2 + 20; }
+ }
+ }
+ template
+ __global__ void f3(int *result) { *result = sizeof(T); }
+ )"};
+
+``hiprtcAddNameExpression`` is called with various name expressions referring to the address of ``__global__`` functions and ``__device__/__constant__`` variables.
+
+.. code-block:: cpp
+
+ kernel_name_vec.push_back("&f1");
+ kernel_name_vec.push_back("N1::N2::f2");
+ kernel_name_vec.push_back("f3");
+ for (auto&& x : kernel_name_vec) hiprtcAddNameExpression(prog, x.c_str());
+ variable_name_vec.push_back("&V1");
+ variable_name_vec.push_back("&N1::N2::V2");
+ for (auto&& x : variable_name_vec) hiprtcAddNameExpression(prog, x.c_str());
+
+After which, the program is compiled using ``hiprtcCompileProgram`` and the generated binary is loaded using ``hipModuleLoadData``. And the mangled names can be fetched using ``hiprtcGetLoweredName``.
+
+.. code-block:: cpp
+
+ for (decltype(variable_name_vec.size()) i = 0; i != variable_name_vec.size(); ++i) {
+ const char* name;
+ hiprtcGetLoweredName(prog, variable_name_vec[i].c_str(), &name);
+ }
+
+.. code-block:: cpp
+
+ for (decltype(kernel_name_vec.size()) i = 0; i != kernel_name_vec.size(); ++i) {
+ const char* name;
+ hiprtcGetLoweredName(prog, kernel_name_vec[i].c_str(), &name);
+ }
+
+The mangled names of the variables are used to look up the variables in the module and update their values.
+
+.. code-block:: cpp
+
+ hipDeviceptr_t variable_addr;
+ size_t bytes{};
+ hipModuleGetGlobal(&variable_addr, &bytes, module, name);
+ hipMemcpyHtoD(variable_addr, &initial_value, sizeof(initial_value));
+
+
+Finally, the mangled name of the kernel is used to launch it using the ``hipModule`` APIs.
+
+.. code-block:: cpp
+
+ hipFunction_t kernel;
+ hipModuleGetFunction(&kernel, module, name);
+ hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, config);
+
+Please have a look at ``hiprtcGetLoweredName.cpp`` for the detailed example.
+
+Versioning
+===============================================================================
+
+HIPRTC follows the below versioning.
+
+* Linux
+
+ * HIPRTC follows the same versioning as HIP runtime library.
+ * The ``so`` name field for the shared library is set to MAJOR version. For example, for HIP 5.3 the ``so`` name is set to 5 (``hiprtc.so.5``).
+
+* Windows
+
+ * HIPRTC dll is named as ``hiprtcXXYY.dll`` where XX is MAJOR version and YY is MINOR version. For example, for HIP 5.3 the name is ``hiprtc0503.dll``.
+
+HIP header support
+===============================================================================
+
+* Added HIPRTC support for all the hip common header files such as library_types.h, hip_math_constants.h, hip_complex.h, math_functions.h, surface_types.h etc. from 6.1. HIPRTC users need not include any HIP macros or constants explicitly in their header files. All of these should get included via HIPRTC builtins when the app links to HIPRTC library.
+
+Deprecation notice
+===============================================================================
+
+* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a separate library ``libhiprtc.so``/``libhiprtc.dll``. But on Linux, HIPRTC symbols are also present in ``libamdhip64.so`` in order to support the existing applications. Gradually, these symbols will be removed from HIP library and applications using HIPRTC will be required to explicitly link to HIPRTC library. However, on Windows ``hiprtc.dll`` must be used as the ``amdhip64.dll`` doesn't contain the HIPRTC symbols.
+* Data types such as ``uint32_t``, ``uint64_t``, ``int32_t``, ``int64_t`` defined in std namespace in HIPRTC were deprecated earlier and are being removed from ROCm release 6.1 since these can conflict with the standard C++ data types. These data types are now prefixed with ``__hip_``, e.g. ``__hip_uint32_t``. Applications previously using ``std::uint32_t`` or similar types can use ``__hip_`` prefixed types to avoid conflicts with standard std namespace or applications can have their own definitions for these types. Also, type_traits templates previously defined in std namespace are moved to ``__hip_internal`` namespace as implementation details.
diff --git a/docs/how-to/hip_runtime_api.rst b/docs/how-to/hip_runtime_api.rst
new file mode 100644
index 0000000000..223f1b296e
--- /dev/null
+++ b/docs/how-to/hip_runtime_api.rst
@@ -0,0 +1,48 @@
+.. meta::
+ :description: HIP runtime API usage
+ :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API How to,
+
+.. _hip_runtime_api_how-to:
+
+********************************************************************************
+Using HIP runtime API
+********************************************************************************
+
+The HIP runtime API provides C and C++ functionality to manage events, streams,
+and memory on GPUs. On the AMD platform, the HIP runtime uses
+:doc:`Compute Language Runtime (CLR) <../understand/amd_clr>`, while on NVIDIA
+CUDA platform, it is only a thin layer over the CUDA runtime or Driver API.
+
+- **CLR** contains source code for AMD's compute language runtimes: ``HIP`` and
+ ``OpenCL™``. CLR includes the ``HIP`` implementation on the AMD
+ platform: `hipamd `_ and the
+ ROCm Compute Language Runtime (``rocclr``). ``rocclr`` is a
+ virtual device interface that enables the HIP runtime to interact with
+ different backends such as :doc:`ROCr ` on Linux or PAL on
+ Windows. CLR also includes the `OpenCL runtime `_
+ implementation.
+- The **CUDA runtime** is built on top of the CUDA driver API, which is a C API
+ with lower-level access to NVIDIA GPUs. For details about the CUDA driver and
+ runtime API with reference to HIP, see :doc:`CUDA driver API porting guide <../how-to/hip_porting_driver_api>`.
+
+The backends of HIP runtime API under AMD and NVIDIA platform are summarized in
+the following figure:
+
+.. figure:: ../data/how-to/hip_runtime_api/runtimes.svg
+
+.. note::
+
+ On NVIDIA platform HIP runtime API calls CUDA runtime or CUDA driver via
+ hipother interface. For more information, see the `hipother repository `_.
+
+Here are the various HIP Runtime API high level functions:
+
+* :doc:`./hip_runtime_api/initialization`
+* :doc:`./hip_runtime_api/memory_management`
+* :doc:`./hip_runtime_api/error_handling`
+* :doc:`./hip_runtime_api/cooperative_groups`
+* :doc:`./hip_runtime_api/hipgraph`
+* :doc:`./hip_runtime_api/call_stack`
+* :doc:`./hip_runtime_api/multi_device`
+* :doc:`./hip_runtime_api/opengl_interop`
+* :doc:`./hip_runtime_api/external_interop`
\ No newline at end of file
diff --git a/docs/how-to/hip_runtime_api/call_stack.rst b/docs/how-to/hip_runtime_api/call_stack.rst
new file mode 100644
index 0000000000..43354cd0cf
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/call_stack.rst
@@ -0,0 +1,129 @@
+.. meta::
+ :description: This page describes call stack concept in HIP
+ :keywords: AMD, ROCm, HIP, call stack
+
+*******************************************************************************
+Call stack
+*******************************************************************************
+
+The call stack is a data structure for managing function calls, by saving the
+state of the current function. Each time a function is called, a new call frame
+is added to the top of the stack, containing information such as local
+variables, return addresses and function parameters. When the function
+execution completes, the frame is removed from the stack and loaded back into
+the corresponding registers. This concept allows the program to return to the
+calling function and continue execution from where it left off.
+
+The call stack for each thread must track its function calls, local variables,
+and return addresses. However, in GPU programming, the memory required to store
+the call stack increases due to the parallelism inherent to the GPUs. NVIDIA
+and AMD GPUs use different approaches. NVIDIA GPUs have the independent thread
+scheduling feature where each thread has its own call stack and effective
+program counter. On AMD GPUs threads are grouped; each warp has its own call
+stack and program counter. Warps are described and explained in the
+:ref:`inherent_thread_hierarchy`.
+
+If a thread or warp exceeds its stack size, a stack overflow occurs, causing
+kernel failure. This can be detected using debuggers.
+
+Call stack management with HIP
+===============================================================================
+
+You can adjust the call stack size as shown in the following example, allowing
+fine-tuning based on specific kernel requirements. This helps prevent stack
+overflow errors by ensuring sufficient stack memory is allocated.
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+ int main()
+ {
+ size_t stackSize;
+ HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
+ std::cout << "Default stack size: " << stackSize << " bytes" << std::endl;
+
+ // Set a new stack size
+ size_t newStackSize = 1024 * 8; // 8 KiB
+ HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, newStackSize));
+
+ HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
+ std::cout << "Updated stack size: " << stackSize << " bytes" << std::endl;
+
+ return 0;
+ }
+
+Depending on the GPU model, at full occupancy, the call stack can consume a
+significant amount of memory. For instance, an MI300X with 304 compute units
+(CU) and up to 2048 threads per CU could use 304 · 2048 · 1024 bytes = 608 MiB
+for the call stack by default.
+
+Handling recursion and deep function calls
+-------------------------------------------------------------------------------
+
+Similar to CPU programming, recursive functions and deeply nested function
+calls are supported. However, developers must ensure that these functions do
+not exceed the available stack memory, considering the huge amount of memory
+needed for the call stack due to the GPU's inherent parallelism. This can be
+achieved by increasing stack size or optimizing code to reduce stack usage. To
+detect stack overflow add proper error handling or use debugging tools.
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+ __device__ unsigned long long fibonacci(unsigned long long n)
+ {
+ if (n == 0 || n == 1)
+ {
+ return n;
+ }
+ return fibonacci(n - 1) + fibonacci(n - 2);
+ }
+
+ __global__ void kernel(unsigned long long n)
+ {
+ unsigned long long result = fibonacci(n);
+ const size_t x = threadIdx.x + blockDim.x * blockIdx.x;
+
+ if (x == 0)
+ printf("%llu! = %llu \n", n, result);
+ }
+
+ int main()
+ {
+ kernel<<<1, 1>>>(10);
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // With -O0 optimization option hit the stack limit
+ // kernel<<<1, 256>>>(2048);
+ // HIP_CHECK(hipDeviceSynchronize());
+
+ return 0;
+ }
diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/hip_runtime_api/cooperative_groups.rst
similarity index 92%
rename from docs/how-to/cooperative_groups.rst
rename to docs/how-to/hip_runtime_api/cooperative_groups.rst
index 370d6dc729..f4cecffaed 100644
--- a/docs/how-to/cooperative_groups.rst
+++ b/docs/how-to/hip_runtime_api/cooperative_groups.rst
@@ -8,9 +8,16 @@
Cooperative groups
*******************************************************************************
-Cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for the communicating threads. Cooperative groups let you define your own set of thread groups which may fit your user-cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication which can lead to more efficient parallel decompositions.
+The cooperative groups API is an extension to the HIP programming model, which
+provides developers with a flexible, dynamic grouping mechanism for the
+communicating threads. Cooperative groups let you define your own set of thread
+groups which may fit your use-cases better than those defined by the hardware.
+This lets you specify the level of granularity for thread communication which
+can lead to more efficient parallel decompositions.
-The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` is included. The header contains the following elements:
+The API is accessible in the ``cooperative_groups`` namespace after the
+``hip_cooperative_groups.h`` header is included. The header contains the following
+elements:
* Static functions to create groups and subgroups.
* Hardware-accelerated operations over the whole group, like shuffles.
@@ -19,13 +26,13 @@ The API is accessible in the ``cooperative_groups`` namespace after the ``hip_c
* Get group properties member functions.
Cooperative groups thread model
-===============================
+================================================================================
-The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarchy ` and :ref:`block hierarchy `.
+The thread hierarchy abstractions of cooperative groups are depicted in the following figures: :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>`.
.. _coop_thread_top_hierarchy:
-.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
+.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg
:alt: Diagram depicting nested rectangles of varying color. The outermost one
titled "Grid", inside sets of different sized rectangles layered on
one another titled "Block". Each "Block" containing sets of uniform
@@ -34,11 +41,16 @@ The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarc
Cooperative group thread hierarchy in grids.
-The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (Deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution like the original grid.
+The **multi grid** is an abstraction of potentially multiple simultaneous
+launches of the same kernel over multiple devices. The **grid** in cooperative
+groups is a single dispatch of kernels for execution like the original grid.
.. note::
- The ability to synchronize over a grid or multi grid requires the kernel to be launched using the specific cooperative groups API.
+ * The ability to synchronize over a grid or multi grid requires the kernel to
+ be launched using the specific cooperative groups API.
+
+ * Multi grid is deprecated since ROCm 5.0.
The **block** is the same as the :ref:`inherent_thread_model` block entity.
@@ -48,7 +60,7 @@ The **block** is the same as the :ref:`inherent_thread_model` block entity.
.. _coop_thread_bottom_hierarchy:
-.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
+.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg
:alt: The new level between block thread and threads.
Cooperative group thread hierarchy in blocks.
diff --git a/docs/how-to/hip_runtime_api/error_handling.rst b/docs/how-to/hip_runtime_api/error_handling.rst
new file mode 100644
index 0000000000..b9db4ea544
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/error_handling.rst
@@ -0,0 +1,136 @@
+.. meta::
+ :description: Error Handling
+ :keywords: AMD, ROCm, HIP, error handling, error
+
+.. _error_handling:
+
+********************************************************************************
+Error handling
+********************************************************************************
+
+HIP provides functionality to detect, report, and manage errors that occur
+during the execution of HIP runtime functions or when launching kernels. Every
+HIP runtime function, apart from launching kernels, has :cpp:type:`hipError_t`
+as return type. :cpp:func:`hipGetLastError` and :cpp:func:`hipPeekAtLastError`
+can be used for catching errors from kernel launches, as kernel launches don't
+return an error directly. HIP maintains an internal state, that includes the
+last error code. :cpp:func:`hipGetLastError` returns and resets that error to
+``hipSuccess``, while :cpp:func:`hipPeekAtLastError` just returns the error
+without changing it. To get a human readable version of the errors,
+:cpp:func:`hipGetErrorString` and :cpp:func:`hipGetErrorName` can be used.
+
+.. note::
+
+ :cpp:func:`hipGetLastError` returns the returned error code of the last HIP
+ runtime API call even if it's ``hipSuccess``, while ``cudaGetLastError``
+ returns the error returned by any of the preceding CUDA APIs in the same
+ host thread. :cpp:func:`hipGetLastError` behavior will be matched with
+ ``cudaGetLastError`` in ROCm release 7.0.
+
+Best practices of HIP error handling:
+
+1. Check errors after each API call - Avoid error propagation.
+2. Use macros for error checking - Check :ref:`hip_check_macros`.
+3. Handle errors gracefully - Free resources and provide meaningful error
+ messages to the user.
+
+For more details on the error handling functions, see :ref:`error handling
+functions reference page `.
+
+.. _hip_check_macros:
+
+HIP check macros
+================================================================================
+
+HIP uses check macros to simplify error checking and reduce code duplication.
+The ``HIP_CHECK`` macros are mainly used to detect and report errors. They can
+also exit from the application with an ``exit(1);`` function call after
+printing the error. The following is an example ``HIP_CHECK`` macro:
+
+.. code-block:: cpp
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+Complete example
+================================================================================
+
+A complete example to demonstrate the error handling with a simple addition of
+two values kernel:
+
+.. code-block:: cpp
+
+ #include
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c, size_t size) {
+ const size_t index = threadIdx.x + blockDim.x * blockIdx.x;
+ if(index < size) {
+ c[index] += a[index] + b[index];
+ }
+ }
+
+ int main() {
+ constexpr int numOfBlocks = 256;
+ constexpr int threadsPerBlock = 256;
+ constexpr size_t arraySize = 1U << 16;
+
+ std::vector a(arraySize), b(arraySize), c(arraySize);
+ int *d_a, *d_b, *d_c;
+
+ // Setup input values.
+ std::fill(a.begin(), a.end(), 1);
+ std::fill(b.begin(), b.end(), 2);
+
+ // Allocate device copies of a, b and c.
+ HIP_CHECK(hipMalloc(&d_a, arraySize * sizeof(*d_a)));
+ HIP_CHECK(hipMalloc(&d_b, arraySize * sizeof(*d_b)));
+ HIP_CHECK(hipMalloc(&d_c, arraySize * sizeof(*d_c)));
+
+ // Copy input values to device.
+ HIP_CHECK(hipMemcpy(d_a, &a, arraySize * sizeof(*d_a), hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_b, &b, arraySize * sizeof(*d_b), hipMemcpyHostToDevice));
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(numOfBlocks), dim3(threadsPerBlock), 0, 0, d_a, d_b, d_c, arraySize);
+ // Check the kernel launch
+ HIP_CHECK(hipGetLastError());
+ // Check for kernel execution error
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Copy the result back to the host.
+ HIP_CHECK(hipMemcpy(&c, d_c, arraySize * sizeof(*d_c), hipMemcpyDeviceToHost));
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(d_a));
+ HIP_CHECK(hipFree(d_b));
+ HIP_CHECK(hipFree(d_c));
+
+ // Print the result.
+ std::cout << a[0] << " + " << b[0] << " = " << c[0] << std::endl;
+
+ return 0;
+ }
diff --git a/docs/how-to/hip_runtime_api/external_interop.rst b/docs/how-to/hip_runtime_api/external_interop.rst
new file mode 100644
index 0000000000..f8f2a97d08
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/external_interop.rst
@@ -0,0 +1,140 @@
+.. meta::
+ :description: HIP provides an external resource interoperability API that
+ allows efficient data sharing between HIP's computing power and
+ OpenGL's graphics rendering.
+ :keywords: AMD, ROCm, HIP, external, interop, interoperability
+
+*******************************************************************************
+External resource interoperability
+*******************************************************************************
+
+This feature allows HIP to work with resources -- like memory and semaphores --
+created by other APIs. This means resources can be used from APIs like CUDA,
+OpenCL and Vulkan within HIP, making it easier to integrate HIP into existing
+projects.
+
+To use external resources in HIP, you typically follow these steps:
+
+- Import resources from other APIs using HIP provided functions
+- Use external resources as if they were created in HIP
+- Destroy the HIP resource object to clean up
+
+Semaphore Functions
+===============================================================================
+
+Semaphore functions are essential for synchronization in parallel computing.
+These functions facilitate communication and coordination between different
+parts of a program or between different programs. By managing semaphores, tasks
+are executed in the correct order, and resources are utilized effectively.
+Semaphore functions ensure smooth operation by preventing conflicts between
+concurrent processes, which upholds both their integrity and their
+performance.
+
+External semaphore functions can be used in HIP as described in :ref:`external_resource_interoperability_reference`.
+
+Memory Functions
+===============================================================================
+
+HIP external memory functions focus on the efficient sharing and management of
+memory resources. These functions enable importing memory created by external
+systems, enabling the HIP program to use this memory seamlessly. Memory
+functions include mapping memory for effective use and ensuring proper cleanup
+to prevent resource leaks. This is critical for performance, particularly in
+applications handling large datasets or complex structures such as textures in
+graphics. Proper memory management ensures stability and efficient resource
+utilization.
+
+Example
+===============================================================================
+
+ROCm examples include a
+`HIP--Vulkan interoperation example `_
+that demonstrates how to perform interoperation between HIP and Vulkan.
+
+In this example, a simple HIP kernel is used to compute a sine wave, which is
+then rendered to a window as a graphical output using Vulkan. The process
+requires several initialization steps, such as setting up a HIP context,
+creating a Vulkan instance, and configuring the GPU device and queue. After
+these initial steps, the kernel executes the sine wave computation, and Vulkan
+continuously updates the window framebuffer to display the computed data until
+the window is closed.
+
+The following code converts a Vulkan memory handle to its equivalent HIP
+handle. The input ``VkDeviceMemory`` and the created HIP memory represent the
+same physical area of GPU memory, through the handles of each respective API.
+Writing to the buffer in one API will allow us to read the results through the
+other. Note that access to the buffer should be synchronized between the APIs,
+for example using queue syncs or semaphores.
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx vulkan memory to hip start]
+ :end-before: // [Sphinx vulkan memory to hip end]
+ :language: cpp
+
+..
+
+The Vulkan semaphore is converted to a HIP semaphore, as shown in the following
+example. Signaling on the semaphore in one API will allow the other API to wait
+on it, which is how we can guarantee synchronized access to resources in a
+cross-API manner.
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx semaphore import start]
+ :end-before: // [Sphinx semaphore import end]
+ :language: cpp
+
+..
+
+When the HIP external memory is exported from Vulkan and imported to HIP, it is
+not yet ready for use. The Vulkan handle is shared, allowing for memory sharing
+rather than copying during the export process. To actually use the memory, it
+must be mapped to a pointer that can be passed to the kernel, which can then
+read from and write to it. The external memory is mapped to HIP in the
+following example:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx map external memory start]
+ :end-before: // [Sphinx map external memory end]
+ :language: cpp
+
+..
+
+Wait until the buffer is ready and no longer being modified on the Vulkan side:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx wait semaphore start]
+ :end-before: // [Sphinx wait semaphore end]
+ :language: cpp
+
+..
+
+The sinewave kernel implementation:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx sinewave kernel start]
+ :end-before: // [Sphinx sinewave kernel end]
+ :language: cpp
+
+..
+
+Signal to Vulkan that we are done with the buffer and that it can proceed with
+rendering:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+ :start-after: // [Sphinx signal semaphore start]
+ :end-before: // [Sphinx signal semaphore end]
+ :language: cpp
+
+..
\ No newline at end of file
diff --git a/docs/how-to/hipgraph.rst b/docs/how-to/hip_runtime_api/hipgraph.rst
similarity index 98%
rename from docs/how-to/hipgraph.rst
rename to docs/how-to/hip_runtime_api/hipgraph.rst
index 958784a71f..01c036af2b 100644
--- a/docs/how-to/hipgraph.rst
+++ b/docs/how-to/hip_runtime_api/hipgraph.rst
@@ -12,7 +12,7 @@ HIP graphs
The HIP graph API is currently in Beta. Some features can change and might
have outstanding issues. Not all features supported by CUDA graphs are yet
supported. For a list of all currently supported functions see the
- :doc:`HIP graph API documentation<../doxygen/html/group___graph>`.
+ :ref:`HIP graph API documentation`.
HIP graphs are an alternative way of executing tasks on a GPU that can provide
performance benefits over launching kernels using the standard
@@ -35,7 +35,7 @@ The nodes can be one of the following:
The following figure visualizes the concept of graphs, compared to using streams.
-.. figure:: ../data/how-to/hipgraph/hip_graph.svg
+.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph.svg
:alt: Diagram depicting the difference between using streams to execute
kernels with dependencies, resolved by explicitly synchronizing,
or using graphs, where the edges denote the dependencies.
@@ -56,7 +56,7 @@ HIP runtime takes care of executing the operations within the graph.
Graphs can provide additional performance benefits, by enabling optimizations
that are only possible when knowing the dependencies between the operations.
-.. figure:: ../data/how-to/hipgraph/hip_graph_speedup.svg
+.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.svg
:alt: Diagram depicting the speed up achievable with HIP graphs compared to
HIP streams when launching many short-running kernels.
@@ -316,11 +316,11 @@ edges of the graph, thereby forming the graph structure.
The nodes are represented by the generic :cpp:type:`hipGraphNode_t` type. The actual
node type is implicitly defined by the specific function used to add the node to
the graph, for example :cpp:func:`hipGraphAddKernelNode` See the
-:doc:`HIP graph API documentation<../doxygen/html/group___graph>` for the
+:ref:`HIP graph API documentation` for the
available functions, they are of type ``hipGraphAdd{Type}Node``. Each type of
node also has a predefined set of parameters depending on the operation, for
example :cpp:class:`hipKernelNodeParams` for a kernel launch. See the
-:doc:`documentation for the general hipGraphNodeParams type<../doxygen/html/structhip_graph_node_params>`
+:doc:`documentation for the general hipGraphNodeParams type<../../doxygen/html/structhip_graph_node_params>`
for a list of available parameter types and their members.
The general flow for explicitly creating a graph is usually:
diff --git a/docs/how-to/hip_runtime_api/initialization.rst b/docs/how-to/hip_runtime_api/initialization.rst
new file mode 100644
index 0000000000..cf2c6a495a
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/initialization.rst
@@ -0,0 +1,107 @@
+.. meta::
+ :description: Initialization.
+ :keywords: AMD, ROCm, HIP, initialization
+
+.. _initialization:
+
+********************************************************************************
+Initialization
+********************************************************************************
+
+The initialization involves setting up the environment and resources needed for
+using GPUs. The following steps are covered with the initialization:
+
+- Setting up the HIP runtime
+
+ This includes reading the environment variables set during init, setting up
+ the active or visible devices, loading necessary libraries, setting up
+ internal buffers for memory copies or cooperative launches, initializing the
+ compiler as well as the HSA runtime, and checking for any errors due to a lack
+ of resources or no active devices.
+
+- Querying and setting GPUs
+
+ Identifying and querying the available GPU devices on the system.
+
+- Setting up contexts
+
+ Creating contexts for each GPU device, which are essential for managing
+ resources and executing kernels. For further details, check the :ref:`context
+ section `.
+
+Initialize the HIP runtime
+================================================================================
+
+The HIP runtime is initialized automatically when the first HIP API call is
+made. However, you can explicitly initialize it using :cpp:func:`hipInit`,
+to be able to control the timing of the initialization. The manual
+initialization can be useful to ensure that the GPU is initialized and
+ready, or to isolate GPU initialization time from other parts of
+your program.
+
+.. note::
+
+ You can use :cpp:func:`hipDeviceReset` to delete all streams created, memory
+ allocated, kernels running and events created by the current process. Any new
+ HIP API call initializes the HIP runtime again.
+
+Querying and setting GPUs
+================================================================================
+
+If multiple GPUs are available in the system, you can query and select the
+desired GPU(s) to use based on device properties, such as size of global memory,
+size of shared memory per block, support of cooperative launch and support of
+managed memory.
+
+Querying GPUs
+--------------------------------------------------------------------------------
+
+The properties of a GPU can be queried using :cpp:func:`hipGetDeviceProperties`,
+which returns a struct of :cpp:struct:`hipDeviceProp_t`. The properties in the
+struct can be used to identify a device or give an overview of hardware
+characteristics, that might make one GPU better suited for the task than others.
+
+The :cpp:func:`hipGetDeviceCount` function returns the number of available GPUs,
+which can be used to loop over the available GPUs.
+
+Example code of querying GPUs:
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ int main() {
+     // Print the index and name of every GPU reported by the HIP runtime.
+     int deviceCount = 0;
+     if (hipGetDeviceCount(&deviceCount) == hipSuccess){
+         for (int i = 0; i < deviceCount; ++i){
+             hipDeviceProp_t prop;
+             if (hipGetDeviceProperties(&prop, i) == hipSuccess) // Skip devices whose query fails.
+                 std::cout << "Device " << i << ": " << prop.name << std::endl;
+         }
+     }
+
+     return 0;
+ }
+
+Setting the GPU
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipSetDevice` function selects the GPU to be used for subsequent
+HIP operations. This function performs several key tasks:
+
+- Context Binding
+
+ Binds the current thread to the context of the specified GPU device. This
+ ensures that all subsequent operations are executed on the selected device.
+
+- Resource Allocation
+
+ Prepares the device for resource allocation, such as memory allocation and
+ stream creation.
+
+- Check device availability
+
+ Checks for errors in device selection and returns error if the specified
+ device is not available or not capable of executing HIP operations.
diff --git a/docs/how-to/hip_runtime_api/memory_management.rst b/docs/how-to/hip_runtime_api/memory_management.rst
new file mode 100644
index 0000000000..a45947b8e5
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management.rst
@@ -0,0 +1,52 @@
+.. meta::
+ :description: Memory management and its usage
+ :keywords: AMD, ROCm, HIP, CUDA, memory management
+
+.. _memory_management:
+
+********************************************************************************
+Memory management
+********************************************************************************
+
+Memory management is an important part of the HIP runtime API, when creating
+high-performance applications. Both allocating and copying memory can result in
+bottlenecks, which can significantly impact performance.
+
+The programming model is based on a system with a host and a device, each having
+its own distinct memory. Kernels operate on :ref:`device_memory`, while host functions
+operate on :ref:`host_memory`.
+
+The runtime offers functions for allocating, freeing, and copying device memory,
+along with transferring data between host and device memory.
+
+Here are the various memory management techniques:
+
+* :ref:`coherence_control`
+* :ref:`unified_memory`
+* :ref:`virtual_memory`
+* :ref:`stream_ordered_memory_allocator_how-to`
+
+Memory allocation
+================================================================================
+
+The API calls and the resulting allocations are listed here:
+
+.. list-table:: Memory allocation
+ :header-rows: 1
+ :align: center
+
+ * - API
+ - Data location
+ - Allocation
+ * - System allocated
+ - Host
+ - :ref:`Pageable `
+ * - :cpp:func:`hipMallocManaged`
+ - Host
+ - :ref:`Managed `
+ * - :cpp:func:`hipHostMalloc`
+ - Host
+ - :ref:`Pinned `
+ * - :cpp:func:`hipMalloc`
+ - Device
+ - Pinned
diff --git a/docs/how-to/hip_runtime_api/memory_management/coherence_control.rst b/docs/how-to/hip_runtime_api/memory_management/coherence_control.rst
new file mode 100644
index 0000000000..7754add29a
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management/coherence_control.rst
@@ -0,0 +1,178 @@
+.. meta::
+ :description: This chapter describes coherence control in the HIP ecosystem
+               ROCm software.
+ :keywords: AMD, ROCm, HIP, coherence control, host memory
+
+.. _coherence_control:
+
+*******************************************************************************
+Coherence control
+*******************************************************************************
+
+Memory coherence describes how memory of a specific part of the system is
+visible to the other parts of the system. For example, how GPU memory is visible
+to the CPU and vice versa. In HIP, host and device memory can be allocated with
+two different types of coherence:
+
+* **Coarse-grained coherence:** The memory is considered up-to-date only after
+ synchronization performed using :cpp:func:`hipDeviceSynchronize`,
+ :cpp:func:`hipStreamSynchronize`, or any blocking operation that acts on the
+ null stream such as :cpp:func:`hipMemcpy`. To prevent the cache from being
+ accessed by a part of the system while simultaneously being written by
+ another, the memory is made visible only after the caches have been flushed.
+
+* **Fine-grained coherence:** The memory is coherent even while being modified
+ by a part of the system. Fine-grained coherence ensures that up-to-date data
+ is visible to others regardless of kernel boundaries. This can be useful if
+ both host and device operate on the same data.
+
+.. note::
+
+ To achieve fine-grained coherence, many AMD GPUs use a limited cache policy,
+ such as leaving these allocations uncached by the GPU or making them read-only.
+
+MI200 accelerators' hardware-based floating-point instructions work on
+coarse-grained memory regions. Coarse-grained coherence is typically useful in
+reducing host-device interconnect communication.
+
+To check the availability of fine- and coarse-grained memory pools, use
+``rocminfo``:
+
+.. code-block:: sh
+
+ $ rocminfo
+ ...
+ *******
+ Agent 1
+ *******
+ Name: AMD EPYC 7742 64-Core Processor
+ ...
+ Pool Info:
+ Pool 1
+ Segment: GLOBAL; FLAGS: FINE GRAINED
+ ...
+ Pool 3
+ Segment: GLOBAL; FLAGS: COARSE GRAINED
+ ...
+ *******
+ Agent 9
+ *******
+ Name: gfx90a
+ ...
+ Pool Info:
+ Pool 1
+ Segment: GLOBAL; FLAGS: COARSE GRAINED
+ ...
+
+The APIs, flags and respective memory coherence control are listed in the
+following table:
+
+.. list-table:: Memory coherence control
+ :widths: 25, 35, 20, 20
+ :header-rows: 1
+ :align: center
+
+ * - API
+ - Flag
+ - :cpp:func:`hipMemAdvise` call with argument
+ - Coherence
+ * - ``hipHostMalloc`` :sup:`1`
+ - ``hipHostMallocDefault``
+ -
+ - Fine-grained
+ * - ``hipHostMalloc`` :sup:`1`
+ - ``hipHostMallocNonCoherent``
+ -
+ - Coarse-grained
+ * - ``hipExtMallocWithFlags``
+ - ``hipDeviceMallocDefault``
+ -
+ - Coarse-grained
+ * - ``hipExtMallocWithFlags``
+ - ``hipDeviceMallocFinegrained``
+ -
+ - Fine-grained
+ * - ``hipMallocManaged``
+ -
+ -
+ - Fine-grained
+ * - ``hipMallocManaged``
+ -
+ - ``hipMemAdviseSetCoarseGrain``
+ - Coarse-grained
+ * - ``malloc``
+ -
+ -
+ - Fine-grained
+ * - ``malloc``
+ -
+ - ``hipMemAdviseSetCoarseGrain``
+ - Coarse-grained
+
+:sup:`1` The :cpp:func:`hipHostMalloc` memory allocation coherence mode can be
+affected by the ``HIP_HOST_COHERENT`` environment variable, if the
+``hipHostMallocCoherent``, ``hipHostMallocNonCoherent``, and
+``hipHostMallocMapped`` are unset. If neither these flags nor the
+``HIP_HOST_COHERENT`` environment variable is set, or set as 0, the host memory
+allocation is coarse-grained.
+
+.. note::
+
+ * When ``hipHostMallocMapped`` flag is set, the allocated host memory is
+ fine-grained and the ``hipHostMallocNonCoherent`` flag is ignored.
+ * Setting both the ``hipHostMallocCoherent`` and
+ ``hipHostMallocNonCoherent`` flags leads to an illegal state.
+
+Visibility of synchronization functions
+================================================================================
+
+Fine-grained coherent memory is visible at the synchronization points;
+however, the visibility of coarse-grained memory depends on the synchronization
+function used. The effect and visibility of various synchronization functions on
+fine- and coarse-grained memory types are listed here:
+
+.. list-table:: HIP synchronize functions effect and visibility
+
+ * - HIP API
+ - :cpp:func:`hipStreamSynchronize`
+ - :cpp:func:`hipDeviceSynchronize`
+ - :cpp:func:`hipEventSynchronize`
+ - :cpp:func:`hipStreamWaitEvent`
+ * - Synchronization effect
+ - Host waits for all commands in the specified stream to complete
+ - Host waits for all commands in all streams on the specified device to complete
+ - Host waits for the specified event to complete
+ - Stream waits for the specified event to complete
+ * - Fence
+ - System-scope release
+ - System-scope release
+ - System-scope release
+ - None
+ * - Fine-grained host memory visibility
+ - Yes
+ - Yes
+ - Yes
+ - Yes
+ * - Coarse-grained host memory visibility
+ - Yes
+ - Yes
+ - Depends on the used event.
+ - No
+
+You can control the release scope for ``hipEvents``. By default, the GPU
+performs a device-scope acquire and release operation with each recorded event.
+This makes the host and device memory visible to other commands executing on the
+same device.
+
+You can specify a stronger system-level fence, or disable timing, by creating
+the event with :cpp:func:`hipEventCreateWithFlags` and the following flags:
+
+* ``hipEventReleaseToSystem``: Performs a system-scope release operation when
+ the event is recorded. This makes both fine-grained and coarse-grained host
+ memory visible to other agents in the system, which might also involve
+ heavyweight operations such as cache flushing. Fine-grained memory typically
+ uses lighter-weight in-kernel synchronization mechanisms such as an atomic
+ operation and thus doesn't need to use ``hipEventReleaseToSystem``.
+
+* ``hipEventDisableTiming``: Events created with this flag don't record
+ profiling data, which significantly improves synchronization performance.
diff --git a/docs/how-to/hip_runtime_api/memory_management/device_memory.rst b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst
new file mode 100644
index 0000000000..8b040d40ec
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management/device_memory.rst
@@ -0,0 +1,52 @@
+.. meta::
+ :description: This chapter describes the device memory of the HIP ecosystem
+ ROCm software.
+ :keywords: AMD, ROCm, HIP, device memory
+
+.. _device_memory:
+
+*******************************************************************************
+Device memory
+*******************************************************************************
+
+Device memory exists on the device, e.g. on GPUs in the video random access
+memory (VRAM), and is accessible by the kernels operating on the device. Recent
+architectures use graphics double data rate (GDDR) synchronous dynamic
+random-access memory (SDRAM) such as GDDR6, or high-bandwidth memory (HBM) such
+as HBM2e. Device memory can be allocated as global memory, constant, texture or
+surface memory.
+
+Global memory
+================================================================================
+
+Read-write storage visible to all threads on a given device. There are
+specialized versions of global memory with different usage semantics which are
+typically backed by the same hardware, but can use different caching paths.
+
+Constant memory
+================================================================================
+
+Read-only storage visible to all threads on a given device. It is a limited
+segment backed by device memory with queryable size. It needs to be set by the
+host before kernel execution. Constant memory provides the best performance
+benefit when all threads within a warp access the same address.
+
+Texture memory
+================================================================================
+
+Read-only storage visible to all threads on a given device and accessible
+through additional APIs. It originates from graphics APIs, and provides
+performance benefits when accessing memory in a pattern where the
+addresses are close to each other in a 2D representation of the memory.
+
+The :ref:`texture management module ` of the HIP
+runtime API reference contains the functions of texture memory.
+
+Surface memory
+================================================================================
+
+A read-write version of texture memory, which can be useful for applications
+that require direct manipulation of 1D, 2D, or 3D hipArray_t.
+
+The :ref:`surface objects module ` of HIP runtime API
+contains the functions for creating, destroying and reading surface memory.
\ No newline at end of file
diff --git a/docs/understand/texture_fetching.rst b/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
similarity index 90%
rename from docs/understand/texture_fetching.rst
rename to docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
index 498e5723f3..a7f2873dd5 100644
--- a/docs/understand/texture_fetching.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/device_memory/texture_fetching.rst
@@ -3,11 +3,13 @@
ROCm software.
:keywords: AMD, ROCm, HIP, Texture, Texture Fetching
+.. _texture_fetching:
+
*******************************************************************************
Texture fetching
*******************************************************************************
-`Textures <../doxygen/html/group___texture.html>`_ are more than just a buffer
+`Textures <../../../../doxygen/html/group___texture.html>`_ are more than just a buffer
interpreted as a 1D, 2D, or 3D array.
As textures are associated with graphics, they are indexed using floating-point
@@ -32,7 +34,7 @@ sections.
Here is the sample texture used in this document for demonstration purposes. It
is 2x2 texels and indexed in the [0 to 1] range.
-.. figure:: ../data/understand/textures/original.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/original.png
:width: 150
:alt: Sample texture
:align: center
@@ -66,7 +68,7 @@ The following image shows a texture stretched to a 4x4 pixel quad but still
indexed in the [0 to 1] range. The in-between values are the same as the values
of the nearest texel.
-.. figure:: ../data/understand/textures/nearest.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/nearest.png
:width: 300
:alt: Texture upscaled with nearest point sampling
:align: center
@@ -97,7 +99,7 @@ This following image shows a texture stretched out to a 4x4 pixel quad, but
still indexed in the [0 to 1] range. The in-between values are interpolated
between the neighboring texels.
-.. figure:: ../data/understand/textures/linear.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/linear.png
:width: 300
:alt: Texture upscaled with linear filtering
:align: center
@@ -124,7 +126,7 @@ bounds. The border value must be set before texture fetching.
The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are the border color, which is yellow.
-.. figure:: ../data/understand/textures/border.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/border.png
:width: 300
:alt: Texture with yellow border color
:align: center
@@ -147,7 +149,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are repeating the values at the edge of
the texture.
-.. figure:: ../data/understand/textures/clamp.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/clamp.png
:width: 300
:alt: Texture with clamp addressing
:align: center
@@ -172,7 +174,7 @@ This creates a repeating image effect.
The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are repeating the original texture.
-.. figure:: ../data/understand/textures/wrap.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/wrap.png
:width: 300
:alt: Texture with wrap addressing
:align: center
@@ -201,7 +203,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in The
[0 to 3] range. The out-of-bounds values are repeating the original texture, but
mirrored.
-.. figure:: ../data/understand/textures/mirror.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/mirror.png
:width: 300
:alt: Texture with mirror addressing
:align: center
diff --git a/docs/how-to/hip_runtime_api/memory_management/host_memory.rst b/docs/how-to/hip_runtime_api/memory_management/host_memory.rst
new file mode 100644
index 0000000000..97dcca0946
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management/host_memory.rst
@@ -0,0 +1,217 @@
+.. meta::
+ :description: Host memory of the HIP ecosystem
+ :keywords: AMD, ROCm, HIP, host memory
+
+.. _host_memory:
+
+********************************************************************************
+Host memory
+********************************************************************************
+
+Host memory is the "normal" memory residing in the host RAM and allocated by C
+or C++. Host memory can be allocated in two different ways:
+
+* Pageable memory
+* Pinned memory
+
+The following figure explains how data is transferred in pageable and pinned
+memory.
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg
+
+The pageable and pinned memory allow you to exercise direct control over
+memory operations, which is known as explicit memory management. When using the
+unified memory, you get a simplified memory model with less control over
+low level memory operations.
+
+The difference in memory transfers between explicit and unified memory
+management is highlighted in the following figure:
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
+
+For more details on unified memory management, see :doc:`/how-to/hip_runtime_api/memory_management/unified_memory`.
+
+.. _pageable_host_memory:
+
+Pageable memory
+================================================================================
+
+Pageable memory exists on memory blocks known as "pages" that can be migrated to
+other memory storage. For example, migrating memory between CPU sockets on a
+motherboard or in a system whose RAM runs out of space and starts dumping pages
+into the swap partition of the hard drive.
+
+Pageable memory is usually allocated with a call to ``malloc`` or ``new`` in a
+C++ application.
+
+**Example:** Using pageable host memory in HIP
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+ int main()
+ {
+     // Round trip: pageable host buffers -> device buffers -> pageable host buffers.
+     const int element_number = 100;
+     int *host_input, *host_output;
+     // Host allocation
+     host_input = new int[element_number];
+     host_output = new int[element_number];
+
+     // Host data preparation
+     for (int i = 0; i < element_number; i++) {
+         host_input[i] = i;
+     }
+     memset(host_output, 0, element_number * sizeof(int));
+
+     int *device_input = nullptr, *device_output = nullptr;
+
+     // Device allocation
+     HIP_CHECK(hipMalloc((int **)&device_input, element_number * sizeof(int)));
+     HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));
+
+     // Device data preparation
+     HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
+     HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));
+
+     // Run the kernel: ...
+
+     // Copy the result from the device back to the host
+     HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));
+
+     // Free host memory
+     delete[] host_input;
+     delete[] host_output;
+
+     // Free device memory
+     HIP_CHECK(hipFree(device_input));
+     HIP_CHECK(hipFree(device_output));
+ }
+
+.. note::
+
+ :cpp:func:`hipMalloc` and :cpp:func:`hipFree` are blocking calls. However, HIP
+ also provides non-blocking versions :cpp:func:`hipMallocAsync` and
+ :cpp:func:`hipFreeAsync`, which require a stream as an additional argument.
+
+.. _pinned_host_memory:
+
+Pinned memory
+================================================================================
+
+Pinned memory or page-locked memory is stored in pages that are locked in specific sectors in RAM and can't be migrated. The pointer can be used on both
+host and device. Accessing host-resident pinned memory in device kernels is
+generally not recommended for performance, as it can force the data to traverse
+the host-device interconnect such as PCIe, which is much slower than the on-device bandwidth.
+
+The advantage of pinned memory is the improved transfer time between host and
+device. For transfer operations, such as :cpp:func:`hipMemcpy` or :cpp:func:`hipMemcpyAsync`,
+using pinned memory instead of pageable memory on the host can lead to a three times
+improvement in bandwidth.
+
+The disadvantage of pinned memory is the reduced availability of RAM for other processes, which can negatively impact the overall performance of the host.
+
+**Example:** Using pinned memory in HIP
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if(status != hipSuccess){ \
+ std::cerr << "HIP error " \
+ << status << ": " \
+ << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ } \
+ }
+
+ int main()
+ {
+ const int element_number = 100;
+
+ int *host_input, *host_output;
+ // Host allocation
+ HIP_CHECK(hipHostMalloc((int **)&host_input, element_number * sizeof(int)));
+ HIP_CHECK(hipHostMalloc((int **)&host_output, element_number * sizeof(int)));
+
+ // Host data preparation
+ for (int i = 0; i < element_number; i++) {
+ host_input[i] = i;
+ }
+ memset(host_output, 0, element_number * sizeof(int));
+
+ int *device_input, *device_output;
+
+ // Device allocation
+ HIP_CHECK(hipMalloc((int **)&device_input, element_number * sizeof(int)));
+ HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));
+
+ // Device data preparation
+ HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));
+
+ // Run the kernel
+ // ...
+
+ HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));
+
+ // Free pinned host memory (allocated with hipHostMalloc)
+ HIP_CHECK(hipHostFree(host_input));
+ HIP_CHECK(hipHostFree(host_output));
+
+ // Free device memory
+ HIP_CHECK(hipFree(device_input));
+ HIP_CHECK(hipFree(device_output));
+ }
+
+.. _memory_allocation_flags:
+
+Memory allocation flags for pinned memory
+--------------------------------------------------------------------------------
+
+The memory allocation for pinned memory can be controlled using ``hipHostMalloc`` flags:
+
+* ``hipHostMallocPortable``: The memory allocation is not restricted to the context making the allocation.
+* ``hipHostMallocMapped``: The memory is allocated into the address space for the current device and the device pointer can be obtained with :cpp:func:`hipHostGetDevicePointer`.
+* ``hipHostMallocNumaUser``: The host memory allocation follows Numa policy specified by the user. Target of Numa policy is to select a CPU that is closest to each GPU. Numa distance is the distance between GPU and CPU devices.
+* ``hipHostMallocWriteCombined``: The memory is allocated as write-combined. Although most CPUs read write-combined memory inefficiently, write-combined allocations might be transferred faster across the PCIe bus on some system configurations. It's a good option for data transfer from host to device via mapped pinned memory.
+* ``hipHostMallocCoherent``: Fine-grained memory is allocated. Overrides ``HIP_HOST_COHERENT`` environment variable for specific allocation. For details, see :ref:`coherence_control`.
+* ``hipHostMallocNonCoherent``: Coarse-grained memory is allocated. Overrides ``HIP_HOST_COHERENT`` environment variable for specific allocation. For details, see :ref:`coherence_control`.
+
+All allocation flags are independent and can be set in any combination. The only
+exception is setting ``hipHostMallocCoherent`` and ``hipHostMallocNonCoherent``
+together, which leads to an illegal state. An example of a valid flag
+combination is calling :cpp:func:`hipHostMalloc` with both
+``hipHostMallocPortable`` and ``hipHostMallocMapped`` flags set. Both flags
+use the same model and differ only in how the surrounding code uses
+the host memory.
+
+.. note::
+
+ By default, each GPU selects a Numa CPU node with the least Numa distance
+ between them. This implies that the host memory is automatically allocated on
+ the closest memory pool of the current GPU device's Numa node. Using
+ :cpp:func:`hipSetDevice` API to set a different GPU increases the Numa
+ distance but still allows you to access the host allocation.
+
+ Numa policy is implemented on Linux and is under development on Microsoft
+ Windows.
\ No newline at end of file
diff --git a/docs/how-to/stream_ordered_allocator.rst b/docs/how-to/hip_runtime_api/memory_management/stream_ordered_allocator.rst
similarity index 99%
rename from docs/how-to/stream_ordered_allocator.rst
rename to docs/how-to/hip_runtime_api/memory_management/stream_ordered_allocator.rst
index 0d130a540d..4b68d65c27 100644
--- a/docs/how-to/stream_ordered_allocator.rst
+++ b/docs/how-to/hip_runtime_api/memory_management/stream_ordered_allocator.rst
@@ -2,6 +2,8 @@
:description:
:keywords: stream, memory allocation, SOMA, stream ordered memory allocator
+.. _stream_ordered_memory_allocator_how-to:
+
*******************************************************************************
Stream Ordered Memory Allocator
*******************************************************************************
diff --git a/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
new file mode 100644
index 0000000000..4aac84ca4f
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management/unified_memory.rst
@@ -0,0 +1,740 @@
+.. meta::
+ :description: This chapter describes Unified Memory and shows
+ how to use it in AMD HIP.
+ :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory
+
+.. _unified_memory:
+
+*******************************************************************************
+Unified memory management
+*******************************************************************************
+
+In conventional architectures CPUs and attached devices have their own memory
+space and dedicated physical memory backing it up, e.g. normal RAM for CPUs and
+VRAM on GPUs. This way each device can have physical memory optimized for its
+use case. GPUs usually have specialized memory whose bandwidth is an order of
+magnitude higher than the RAM attached to CPUs.
+
+While providing exceptional performance, this setup typically requires explicit
+memory management, as memory needs to be allocated, copied and freed on the used
+devices and on the host. Additionally, this makes using more than the physically
+available memory on the devices complicated.
+
+Modern GPUs circumvent the problem of having to explicitly manage the memory,
+while still keeping the benefits of the dedicated physical memories, by
+supporting the concept of unified memory. This enables the CPU and the GPUs in
+the system to access host and other GPUs' memory without explicit memory
+management.
+
+Unified memory
+================================================================================
+
+Unified Memory is a single memory address space accessible from any processor
+within a system. This setup simplifies memory management and enables
+applications to allocate data that can be read or written on both CPUs and GPUs
+without explicitly copying it to the specific CPU or GPU. The Unified memory
+model is shown in the following figure.
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
+
+Unified memory enables the access to memory located on other devices via
+several methods, depending on whether hardware support is available or has to be
+managed by the driver.
+
+Hardware supported on-demand page migration
+--------------------------------------------------------------------------------
+
+When a kernel on the device tries to access a memory address that is not in its
+memory, a page-fault is triggered. The GPU then in turn requests the page from
+the host or another device on which the memory is located. The page is then
+unmapped from the source, sent to the device and mapped to the device's memory.
+The requested memory is then available to the processes running on the device.
+
+In case the device's memory is at capacity, a page is unmapped from the device's
+memory first and sent and mapped to host memory. This enables more memory to be
+allocated and used for a GPU than the GPU itself has physically available.
+
+This level of unified memory support can be very beneficial for sparse accesses
+to an array that is not often used on the device.
+
+Driver managed page migration
+--------------------------------------------------------------------------------
+
+If the hardware does not support on-demand page migration, then all the pages
+accessed by a kernel have to be resident on the device, so they have to be
+migrated before the kernel runs. Since the driver cannot know beforehand
+which parts of an array are going to be accessed, all pages of all accessed
+arrays have to be migrated. This can lead to significant delays on the first run
+of a kernel, on top of possibly copying more memory than is actually accessed by
+the kernel.
+
+.. _unified memory system requirements:
+
+System requirements
+================================================================================
+
+Unified memory is supported on Linux by all modern AMD GPUs from the Vega
+series onward, as shown in the following table. Unified memory management can
+be achieved by explicitly allocating managed memory using
+:cpp:func:`hipMallocManaged` or marking variables with the ``__managed__``
+attribute. For the latest GPUs, with a Linux kernel that supports
+`Heterogeneous Memory Management (HMM)
+<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_, the normal system
+allocator can be used.
+
+.. list-table:: Supported Unified Memory Allocators by GPU architecture
+ :widths: 40, 25, 25
+ :header-rows: 1
+ :align: center
+
+ * - Architecture
+ - :cpp:func:`hipMallocManaged()`, ``__managed__``
+ - ``new``, ``malloc()``
+ * - CDNA3
+ - ✅
+ - ✅ :sup:`1`
+ * - CDNA2
+ - ✅
+ - ✅ :sup:`1`
+ * - CDNA1
+ - ✅
+ - ✅ :sup:`1`
+ * - RDNA1
+ - ✅
+ - ❌
+ * - GCN5
+ - ✅
+ - ❌
+
+✅: **Supported**
+
+❌: **Unsupported**
+
+:sup:`1` Works only with ``XNACK=1`` and kernels with HMM support. First GPU
+access causes recoverable page-fault. For more details, visit `GPU memory
+<https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
+
+.. _unified memory allocators:
+
+Unified memory allocators
+================================================================================
+
+Support for the different unified memory allocators depends on the GPU
+architecture and on the system. For more information, see :ref:`unified memory
+system requirements` and :ref:`Checking unified memory support <checking unified memory support>`.
+
+- **HIP allocated managed memory and variables**
+
+ :cpp:func:`hipMallocManaged()` is a dynamic memory allocator available on
+ all GPUs with unified memory support. For more details, visit
+ :ref:`unified_memory_reference`.
+
+ The ``__managed__`` declaration specifier, which serves as its counterpart,
+ can be utilized for static allocation.
+
+- **System allocated unified memory**
+
+ Starting with CDNA2, the ``new`` and ``malloc()`` system allocators allow
+ you to reserve unified memory. The system allocator is more versatile and
+ offers an easy transition for code written for CPUs to HIP code as the
+ same system allocation API is used.
+
+To ensure the proper functioning of system allocated unified memory on supported
+GPUs, it is essential to set the environment variable ``HSA_XNACK=1`` and use
+a kernel that supports `HMM
+<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_. Without this
+configuration, the behavior will be similar to that of systems without HMM
+support. For more details, visit
+`GPU memory <https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
+
+The table below illustrates the expected behavior of managed and unified memory
+functions on ROCm and CUDA, both with and without HMM support.
+
+.. tab-set::
+ .. tab-item:: ROCm allocation behavior
+ :sync: original-block
+
+ .. list-table:: Comparison of expected behavior of managed and unified memory functions in ROCm
+ :widths: 26, 17, 20, 17, 20
+ :header-rows: 1
+
+ * - call
+ - Allocation origin without HMM or ``XNACK=0``
+ - Access outside the origin without HMM or ``XNACK=0``
+ - Allocation origin with HMM and ``XNACK=1``
+ - Access outside the origin with HMM and ``XNACK=1``
+ * - ``new``, ``malloc()``
+ - host
+ - not accessible on device
+ - host
+ - page-fault migration
+ * - :cpp:func:`hipMalloc()`
+ - device
+ - zero copy [zc]_
+ - device
+ - zero copy [zc]_
+ * - :cpp:func:`hipMallocManaged()`, ``__managed__``
+ - pinned host
+ - zero copy [zc]_
+ - host
+ - page-fault migration
+ * - :cpp:func:`hipHostRegister()`
+ - undefined behavior
+ - undefined behavior
+ - host
+ - page-fault migration
+ * - :cpp:func:`hipHostMalloc()`
+ - pinned host
+ - zero copy [zc]_
+ - pinned host
+ - zero copy [zc]_
+
+ .. tab-item:: CUDA allocation behavior
+ :sync: cooperative-groups
+
+ .. list-table:: Comparison of expected behavior of managed and unified memory functions in CUDA
+ :widths: 26, 17, 20, 17, 20
+ :header-rows: 1
+
+ * - call
+ - Allocation origin without HMM
+ - Access outside the origin without HMM
+ - Allocation origin with HMM
+ - Access outside the origin with HMM
+ * - ``new``, ``malloc()``
+ - host
+ - not accessible on device
+ - first touch
+ - page-fault migration
+ * - ``cudaMalloc()``
+ - device
+ - not accessible on host
+ - device
+ - page-fault migration
+ * - ``cudaMallocManaged()``, ``__managed__``
+ - host
+ - page-fault migration
+ - first touch
+ - page-fault migration
+ * - ``cudaHostRegister()``
+ - host
+ - page-fault migration
+ - host
+ - page-fault migration
+ * - ``cudaMallocHost()``
+ - pinned host
+ - zero copy [zc]_
+ - pinned host
+ - zero copy [zc]_
+
+.. [zc] Zero copy is a feature where the memory is pinned to either the device
+ or the host and is not migrated when accessed by another device or
+ the host. Instead, only the requested memory is transferred, without
+ making an explicit copy, like a normal memory access, hence the term
+ "zero copy".
+
+.. _checking unified memory support:
+
+Checking unified memory support
+--------------------------------------------------------------------------------
+
+The following device attributes can offer information about which :ref:`unified
+memory allocators` are supported. The attribute value is 1 if the functionality
+is supported, and 0 if it is not supported.
+
+.. list-table:: Device attributes for unified memory management
+ :widths: 40, 60
+ :header-rows: 1
+ :align: center
+
+ * - Attribute
+ - Description
+ * - :cpp:enumerator:`hipDeviceAttributeManagedMemory`
+ - Device supports allocating managed memory on this system
+ * - :cpp:enumerator:`hipDeviceAttributePageableMemoryAccess`
+ - Device supports coherently accessing pageable memory without calling :cpp:func:`hipHostRegister()` on it.
+ * - :cpp:enumerator:`hipDeviceAttributeConcurrentManagedAccess`
+ - Full unified memory support. Device can coherently access managed memory concurrently with the CPU
+
+For details on how to get the attributes of a specific device see :cpp:func:`hipDeviceGetAttribute()`.
+
+Example for unified memory management
+--------------------------------------------------------------------------------
+
+The following example shows how to use unified memory with
+:cpp:func:`hipMallocManaged()` for dynamic allocation, the ``__managed__`` attribute
+for static allocation and the standard ``new`` allocation. For comparison, the
+explicit memory management example is presented in the last tab.
+
+.. tab-set::
+
+ .. tab-item:: hipMallocManaged()
+
+ .. code-block:: cpp
+ :emphasize-lines: 22-25
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int *a, *b, *c;
+
+ // Allocate memory for a, b and c that is accessible to both device and host codes.
+ HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+ HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+ HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Print the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(a));
+ HIP_CHECK(hipFree(b));
+ HIP_CHECK(hipFree(c));
+
+ return 0;
+ }
+
+ .. tab-item:: __managed__
+
+ .. code-block:: cpp
+ :emphasize-lines: 19-20
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ // Declare a, b and c as static variables.
+ __managed__ int a, b, c;
+
+ int main() {
+ // Setup input values.
+ a = 1;
+ b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
+
+ // Wait for GPU to finish before accessing on host.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Prints the result.
+ std::cout << a << " + " << b << " = " << c << std::endl;
+
+ return 0;
+ }
+
+ .. tab-item:: new
+
+ .. code-block:: cpp
+ :emphasize-lines: 20-23
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int* a, int* b, int* c) {
+ *c = *a + *b;
+ }
+
+ // This example requires HMM support and the environment variable HSA_XNACK needs to be set to 1
+ int main() {
+ // Allocate memory for a, b, and c.
+ int *a = new int[1];
+ int *b = new int[1];
+ int *c = new int[1];
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ delete[] a;
+ delete[] b;
+ delete[] c;
+
+ return 0;
+ }
+
+ .. tab-item:: Explicit Memory Management
+
+ .. code-block:: cpp
+ :emphasize-lines: 27-34, 39-40
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int a, b, c;
+ int *d_a, *d_b, *d_c;
+
+ // Setup input values.
+ a = 1;
+ b = 2;
+
+ // Allocate device copies of a, b and c.
+ HIP_CHECK(hipMalloc(&d_a, sizeof(*d_a)));
+ HIP_CHECK(hipMalloc(&d_b, sizeof(*d_b)));
+ HIP_CHECK(hipMalloc(&d_c, sizeof(*d_c)));
+
+ // Copy input values to device.
+ HIP_CHECK(hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice));
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
+
+ // Copy the result back to the host.
+ HIP_CHECK(hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost));
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(d_a));
+ HIP_CHECK(hipFree(d_b));
+ HIP_CHECK(hipFree(d_c));
+
+ // Prints the result.
+ std::cout << a << " + " << b << " = " << c << std::endl;
+
+ return 0;
+ }
+
+.. _using unified memory:
+
+Using unified memory
+================================================================================
+
+Unified memory can simplify the complexities of memory management in GPU
+computing, by not requiring explicit copies between the host and the devices. It
+can be particularly useful in use cases with sparse memory accesses from both
+the CPU and the GPU, as only the parts of the memory region that are actually
+accessed need to be transferred to the corresponding processor, not the whole
+memory region. This reduces the amount of memory sent over the PCIe bus or other
+interfaces.
+
+In HIP, pinned memory allocations are coherent by default. Pinned memory is
+host memory mapped into the address space of all GPUs, meaning that the pointer
+can be used on both host and device. Additionally, using pinned memory instead of
+pageable memory on the host can improve bandwidth for transfers between the host
+and the GPUs.
+
+While unified memory can provide numerous benefits, it's important to be aware
+of the potential performance overhead associated with unified memory. You must
+thoroughly test and profile your code to ensure it's the most suitable choice
+for your use case.
+
+.. _unified memory runtime hints:
+
+Performance optimizations for unified memory
+================================================================================
+
+There are several ways in which the developer can guide the runtime to reduce
+copies between devices in order to improve performance.
+
+Data prefetching
+--------------------------------------------------------------------------------
+
+Data prefetching is a technique used to improve the performance of your
+application by moving data to the desired device before it's actually
+needed. ``hipCpuDeviceId`` is a special constant to specify the CPU as target.
+
+.. code-block:: cpp
+ :emphasize-lines: 33-36,41-42
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int *a, *b, *c;
+ int deviceId;
+ HIP_CHECK(hipGetDevice(&deviceId)); // Get the current device ID
+
+ // Allocate memory for a, b and c that is accessible to both device and host codes.
+ HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+ HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+ HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Prefetch the data to the GPU device.
+ HIP_CHECK(hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0));
+ HIP_CHECK(hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0));
+ HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0));
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Prefetch the result back to the CPU.
+ HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0));
+
+ // Wait for the prefetch operations to complete.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(a));
+ HIP_CHECK(hipFree(b));
+ HIP_CHECK(hipFree(c));
+
+ return 0;
+ }
+
+Memory advice
+--------------------------------------------------------------------------------
+
+Unified memory runtime hints can be set with :cpp:func:`hipMemAdvise()` to help
+improve the performance of your code if you know the memory usage pattern. There
+are several different types of hints as specified in the enum
+:cpp:enum:`hipMemoryAdvise`, for example, whether a certain device mostly reads
+the memory region, where it should ideally be located, and even whether that
+specific memory region is accessed by a specific device.
+
+For the best performance, profile your application to optimize the
+utilization of HIP runtime hints.
+
+The effectiveness of :cpp:func:`hipMemAdvise()` comes from its ability to inform
+the runtime of the developer's intentions regarding memory usage. When the
+runtime has knowledge of the expected memory access patterns, it can make better
+decisions about data placement, leading to fewer transfers via the interconnect
+and thereby reduced latency and bandwidth requirements. However, the actual
+impact on performance can vary based on the specific use case and the system.
+
+The following is the updated version of the example above with memory advice
+instead of prefetching.
+
+.. code-block:: cpp
+ :emphasize-lines: 29-41
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int deviceId;
+ HIP_CHECK(hipGetDevice(&deviceId));
+ int *a, *b, *c;
+
+ // Allocate memory for a, b, and c accessible to both device and host codes.
+ HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+ HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+ HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+ // Set memory advice for a and b to be read, located on and accessed by the GPU.
+ HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, deviceId));
+ HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetAccessedBy, deviceId));
+ HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId));
+
+ HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, deviceId));
+ HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetAccessedBy, deviceId));
+ HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, deviceId));
+
+ // Set memory advice for c to be read, located on and accessed by the CPU.
+ HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId));
+ HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetAccessedBy, hipCpuDeviceId));
+ HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, hipCpuDeviceId));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(a));
+ HIP_CHECK(hipFree(b));
+ HIP_CHECK(hipFree(c));
+
+ return 0;
+ }
+
+Memory range attributes
+--------------------------------------------------------------------------------
+
+:cpp:func:`hipMemRangeGetAttribute()` allows you to query attributes of a given
+memory range. The attributes are given in :cpp:enum:`hipMemRangeAttribute`.
+
+.. code-block:: cpp
+ :emphasize-lines: 44-49
+
+ #include <hip/hip_runtime.h>
+ #include <iostream>
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t err = expression; \
+ if(err != hipSuccess){ \
+ std::cerr << "HIP error: " \
+ << hipGetErrorString(err) \
+ << " at " << __LINE__ << "\n"; \
+ } \
+ }
+
+ // Addition of two values.
+ __global__ void add(int *a, int *b, int *c) {
+ *c = *a + *b;
+ }
+
+ int main() {
+ int *a, *b, *c;
+ unsigned int attributeValue;
+ constexpr size_t attributeSize = sizeof(attributeValue);
+
+ int deviceId;
+ HIP_CHECK(hipGetDevice(&deviceId));
+
+ // Allocate memory for a, b and c that is accessible to both device and host codes.
+ HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+ HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+ HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+ // Setup input values.
+ *a = 1;
+ *b = 2;
+
+ HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId));
+
+ // Launch add() kernel on GPU.
+ hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+ // Wait for GPU to finish before accessing on host.
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Query an attribute of the memory range.
+ HIP_CHECK(hipMemRangeGetAttribute(&attributeValue,
+ attributeSize,
+ hipMemRangeAttributeReadMostly,
+ a,
+ sizeof(*a)));
+
+ // Prints the result.
+ std::cout << *a << " + " << *b << " = " << *c << std::endl;
+ std::cout << "The array a is" << (attributeValue == 1 ? "" : " NOT") << " set to hipMemRangeAttributeReadMostly" << std::endl;
+
+ // Cleanup allocated memory.
+ HIP_CHECK(hipFree(a));
+ HIP_CHECK(hipFree(b));
+ HIP_CHECK(hipFree(c));
+
+ return 0;
+ }
+
+Asynchronously attach memory to a stream
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipStreamAttachMemAsync()` function attaches memory to a stream,
+which can reduce the amount of memory transferred when managed memory is used.
+When the memory is attached to a stream using this function, it only gets
+transferred between devices when a kernel that is launched on this stream needs
+access to the memory.
diff --git a/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst
new file mode 100644
index 0000000000..597b54040f
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/memory_management/virtual_memory.rst
@@ -0,0 +1,154 @@
+.. meta::
+ :description: This chapter introduces Virtual Memory (VM) and shows
+ how to use it in AMD HIP.
+ :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
+
+.. _virtual_memory:
+
+********************************************************************************
+Virtual memory management
+********************************************************************************
+
+Memory management is important when creating high-performance applications in
+the HIP ecosystem. Both allocating and copying memory can result in bottlenecks,
+which can significantly impact performance.
+
+Global memory allocation in HIP uses the C language style allocation function.
+This works fine for simple cases but can cause problems if your memory needs
+change. If you need to increase the size of your memory, you must allocate a
+second larger buffer and copy the data to it before you can free the original
+buffer. This increases overall memory usage and causes unnecessary ``memcpy``
+calls. Another solution is to allocate a larger buffer than you initially need.
+However, this isn't an efficient way to handle resources and doesn't solve the
+issue of reallocation when the extra buffer runs out.
+
+Virtual memory management solves these memory management problems. It helps to
+reduce memory usage and unnecessary ``memcpy`` calls.
+
+.. _memory_allocation_virtual_memory:
+
+Memory allocation
+================================================================================
+
+Standard memory allocation uses the :cpp:func:`hipMalloc` function to allocate a
+block of memory on the device. However, when using virtual memory, this process
+is separated into multiple steps using the :cpp:func:`hipMemCreate`,
+:cpp:func:`hipMemAddressReserve`, :cpp:func:`hipMemMap`, and
+:cpp:func:`hipMemSetAccess` functions. This guide explains what these functions
+do and how you can use them for virtual memory management.
+
+Allocate physical memory
+--------------------------------------------------------------------------------
+
+The first step is to allocate the physical memory itself with the
+:cpp:func:`hipMemCreate` function. This function accepts the size of the buffer,
+an ``unsigned long long`` variable for the flags, and a
+:cpp:struct:`hipMemAllocationProp` variable. :cpp:struct:`hipMemAllocationProp`
+contains the properties of the memory to be allocated, such as where the memory
+is physically located and what kind of shareable handles are available. If the
+allocation is successful, the function returns a value of
+:cpp:enumerator:`hipSuccess`, with :cpp:type:`hipMemGenericAllocationHandle_t`
+representing a valid physical memory allocation. The allocated memory size must
+be aligned with the granularity appropriate for the properties of the
+allocation. You can use the :cpp:func:`hipMemGetAllocationGranularity` function
+to determine the correct granularity.
+
+.. code-block:: cpp
+
+ size_t granularity = 0;
+ hipMemGenericAllocationHandle_t allocHandle;
+ hipMemAllocationProp prop = {};
+ prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
+ prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = currentDev;
+ hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
+ padded_size = ROUND_UP(size, granularity);
+ hipMemCreate(&allocHandle, padded_size, &prop, 0);
+
+Reserve virtual address range
+--------------------------------------------------------------------------------
+
+After you have acquired an allocation of physical memory, you must map it before
+you can use it. To do so, you need a virtual address to map it to. Mapping
+means the physical memory allocation is available from the virtual address range
+it is mapped to. To reserve a virtual memory range, use the
+:cpp:func:`hipMemAddressReserve` function. The size of the virtual memory must
+match the amount of physical memory previously allocated. You can then map the
+physical memory allocation to the newly-acquired virtual memory address range
+using the :cpp:func:`hipMemMap` function.
+
+.. code-block:: cpp
+
+ hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
+ hipMemMap(ptr, padded_size, 0, allocHandle, 0);
+
+Set memory access
+--------------------------------------------------------------------------------
+
+Finally, use the :cpp:func:`hipMemSetAccess` function to enable memory access.
+It accepts the pointer to the virtual memory, the size, and a
+:cpp:struct:`hipMemAccessDesc` descriptor as parameters. In a multi-GPU
+environment, you can map the device memory of one GPU to another. This feature
+also works with the traditional memory management system, but isn't as scalable
+as with virtual memory. When memory is allocated with :cpp:func:`hipMalloc`,
+:cpp:func:`hipDeviceEnablePeerAccess` is used to enable peer access. This
+function enables access between two devices, but it means that every call to
+:cpp:func:`hipMalloc` takes more time to perform the checks and the mapping
+between the devices. When using virtual memory management, peer access is
+enabled by :cpp:func:`hipMemSetAccess`, which provides a finer level of
+control over what is shared. This has no performance impact on memory allocation
+and gives you more control over what memory buffers are shared with which
+devices.
+
+.. code-block:: cpp
+
+ hipMemAccessDesc accessDesc = {};
+ accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
+ accessDesc.location.id = currentDev;
+ accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
+
+At this point the memory is allocated, mapped, and ready for use. You can read
+and write to it, just like you would a C style memory allocation.
+
+Free virtual memory
+--------------------------------------------------------------------------------
+
+To free the memory allocated in this manner, use the corresponding free
+functions. To unmap the memory, use :cpp:func:`hipMemUnmap`. To release the
+virtual address range, use :cpp:func:`hipMemAddressFree`. Finally, to release
+the physical memory, use :cpp:func:`hipMemRelease`. A side effect of these
+functions is the lack of synchronization when memory is released. In contrast,
+if you call :cpp:func:`hipFree` while multiple streams are running in parallel,
+it synchronizes the device, which causes worse resource usage and performance.
+
+.. code-block:: cpp
+
+ hipMemUnmap(ptr, size);
+ hipMemRelease(allocHandle);
+ hipMemAddressFree(ptr, size);
+
+.. _usage_virtual_memory:
+
+Memory usage
+================================================================================
+
+Dynamically increase allocation size
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipMemAddressReserve` function allows you to increase the amount
+of pre-allocated memory. This function accepts a parameter representing the
+requested starting address of the virtual memory. This allows you to have a
+continuous virtual address space without worrying about the underlying physical
+allocation.
+
+.. code-block:: cpp
+
+ hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
+ hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
+ hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
+
+The code sample above assumes that :cpp:func:`hipMemAddressReserve` was able to
+reserve the memory address at the specified location. However, this isn't
+guaranteed to be true, so you should validate that ``new_ptr`` points to the
+requested virtual address (``ptr + padded_size``) before using it.
diff --git a/docs/how-to/hip_runtime_api/multi_device.rst b/docs/how-to/hip_runtime_api/multi_device.rst
new file mode 100644
index 0000000000..3b79e37dfc
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/multi_device.rst
@@ -0,0 +1,424 @@
+.. meta::
+ :description: This chapter describes how to use multiple devices on one host.
+ :keywords: ROCm, HIP, multi-device, multiple, GPUs, devices
+
+.. _multi-device:
+
+*******************************************************************************
+Multi-device management
+*******************************************************************************
+
+Device enumeration
+===============================================================================
+
+Device enumeration involves identifying all the available GPUs connected to the
+host system. A single host machine can have multiple GPUs, each with its own
+unique identifier. By listing these devices, you can decide which GPU to use
+for computation. The host queries the system to count and list all connected
+GPUs that support the chosen ``HIP_PLATFORM``, ensuring that the application
+can leverage the full computational power available. Typically, applications
+list devices and their properties for deployment planning, and also make
+dynamic selections during runtime to ensure optimal performance.
+
+If the application does not assign a specific GPU, the runtime typically
+assigns one automatically based on factors such as availability, performance,
+memory, and efficiency. As a result, the runtime might choose the least
+utilized GPU, the one best suited for the task, the GPU with adequate memory,
+or the most energy-efficient option.
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ int main()
+ {
+ int deviceCount;
+ hipGetDeviceCount(&deviceCount);
+ std::cout << "Number of devices: " << deviceCount << std::endl;
+
+ for (int deviceId = 0; deviceId < deviceCount; ++deviceId)
+ {
+ hipDeviceProp_t deviceProp;
+ hipGetDeviceProperties(&deviceProp, deviceId);
+ std::cout << "Device " << deviceId << std::endl << " Properties:" << std::endl;
+ std::cout << " Name: " << deviceProp.name << std::endl;
+ std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
+ std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KiB" << std::endl;
+ std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
+ std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
+ std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
+ std::cout << " Max Threads per Multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
+ std::cout << " Number of Multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
+ std::cout << " Max Threads Dimensions: ["
+ << deviceProp.maxThreadsDim[0] << ", "
+ << deviceProp.maxThreadsDim[1] << ", "
+ << deviceProp.maxThreadsDim[2] << "]" << std::endl;
+ std::cout << " Max Grid Size: ["
+ << deviceProp.maxGridSize[0] << ", "
+ << deviceProp.maxGridSize[1] << ", "
+ << deviceProp.maxGridSize[2] << "]" << std::endl;
+ std::cout << std::endl;
+ }
+
+ return 0;
+ }
+
+.. _multi_device_selection:
+
+Device selection
+===============================================================================
+
+Once you have enumerated the available GPUs, the next step is to select a
+specific device for computation. This involves setting the active GPU that will
+execute subsequent operations. This step is crucial in multi-GPU systems where
+different GPUs might have different capabilities or workloads. By selecting the
+appropriate device, you ensure that the computational tasks are directed to the
+correct GPU, optimizing performance and resource utilization.
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if (status != hipSuccess) { \
+ std::cerr << "HIP error " << status \
+ << ": " << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ exit(status); \
+ } \
+ }
+
+ __global__ void simpleKernel(double *data)
+ {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ data[idx] = idx * 2.0;
+ }
+
+ int main()
+ {
+ double* deviceData0;
+ double* deviceData1;
+ size_t size = 1024 * sizeof(*deviceData0);
+
+ int deviceId0 = 0;
+ int deviceId1 = 1;
+
+ // Set device 0 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+ HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+ simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Set device 1 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+ HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+ simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Copy result from device 0
+ double hostData0[1024];
+ HIP_CHECK(hipSetDevice(deviceId0));
+ HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+ // Copy result from device 1
+ double hostData1[1024];
+ HIP_CHECK(hipSetDevice(deviceId1));
+ HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+ // Display results from both devices
+ std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+ std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+ // Free device memory
+ HIP_CHECK(hipFree(deviceData0));
+ HIP_CHECK(hipFree(deviceData1));
+
+ return 0;
+ }
+
+
+Stream and event behavior
+===============================================================================
+
+In a multi-device system, streams and events are essential for efficient
+parallel computation and synchronization. Streams enable asynchronous task
+execution, allowing multiple devices to process data concurrently without
+blocking one another. Events provide a mechanism for synchronizing operations
+across streams and devices, ensuring that tasks on one device are completed
+before dependent tasks on another device begin. This coordination prevents race
+conditions and optimizes data flow in multi-GPU systems. Together, streams and
+events maximize performance by enabling parallel execution, load balancing, and
+effective resource utilization across heterogeneous hardware.
+
+.. code-block:: cpp
+
+ #include
+ #include
+
+ __global__ void simpleKernel(double *data)
+ {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ data[idx] = idx * 2.0;
+ }
+
+ int main()
+ {
+ int numDevices;
+ hipGetDeviceCount(&numDevices);
+
+ if (numDevices < 2) {
+ std::cerr << "This example requires at least two GPUs." << std::endl;
+ return -1;
+ }
+
+ double *deviceData0, *deviceData1;
+ size_t size = 1024 * sizeof(*deviceData0);
+
+ // Create streams and events for each device
+ hipStream_t stream0, stream1;
+ hipEvent_t startEvent0, stopEvent0, startEvent1, stopEvent1;
+
+ // Initialize device 0
+ hipSetDevice(0);
+ hipStreamCreate(&stream0);
+ hipEventCreate(&startEvent0);
+ hipEventCreate(&stopEvent0);
+ hipMalloc(&deviceData0, size);
+
+ // Initialize device 1
+ hipSetDevice(1);
+ hipStreamCreate(&stream1);
+ hipEventCreate(&startEvent1);
+ hipEventCreate(&stopEvent1);
+ hipMalloc(&deviceData1, size);
+
+ // Record the start event on device 0
+ hipSetDevice(0);
+ hipEventRecord(startEvent0, stream0);
+
+ // Launch the kernel asynchronously on device 0
+ simpleKernel<<<1000, 128, 0, stream0>>>(deviceData0);
+
+ // Record the stop event on device 0
+ hipEventRecord(stopEvent0, stream0);
+
+ // Wait for the stop event on device 0 to complete
+ hipEventSynchronize(stopEvent0);
+
+ // Record the start event on device 1
+ hipSetDevice(1);
+ hipEventRecord(startEvent1, stream1);
+
+ // Launch the kernel asynchronously on device 1
+ simpleKernel<<<1000, 128, 0, stream1>>>(deviceData1);
+
+ // Record the stop event on device 1
+ hipEventRecord(stopEvent1, stream1);
+
+ // Wait for the stop event on device 1 to complete
+ hipEventSynchronize(stopEvent1);
+
+ // Calculate elapsed time between the events for both devices
+ float milliseconds0 = 0, milliseconds1 = 0;
+ hipEventElapsedTime(&milliseconds0, startEvent0, stopEvent0);
+ hipEventElapsedTime(&milliseconds1, startEvent1, stopEvent1);
+
+ std::cout << "Elapsed time on GPU 0: " << milliseconds0 << " ms" << std::endl;
+ std::cout << "Elapsed time on GPU 1: " << milliseconds1 << " ms" << std::endl;
+
+ // Cleanup for device 0
+ hipSetDevice(0);
+ hipEventDestroy(startEvent0);
+ hipEventDestroy(stopEvent0);
+ hipStreamSynchronize(stream0);
+ hipStreamDestroy(stream0);
+ hipFree(deviceData0);
+
+ // Cleanup for device 1
+ hipSetDevice(1);
+ hipEventDestroy(startEvent1);
+ hipEventDestroy(stopEvent1);
+ hipStreamSynchronize(stream1);
+ hipStreamDestroy(stream1);
+ hipFree(deviceData1);
+
+ return 0;
+ }
+
+Peer-to-peer memory access
+===============================================================================
+
+In multi-GPU systems, peer-to-peer memory access enables one GPU to directly
+read or write to the memory of another GPU. This capability reduces data
+transfer times by allowing GPUs to communicate directly without involving the
+host. Enabling peer-to-peer access can significantly improve the performance of
+applications that require frequent data exchange between GPUs, as it eliminates
+the need to transfer data through the host memory.
+
+By adding peer-to-peer access to the example referenced in
+:ref:`multi_device_selection`, data can be copied between devices:
+
+.. tab-set::
+
+ .. tab-item:: with peer-to-peer
+
+ .. code-block:: cpp
+ :emphasize-lines: 31-37, 51-55
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if (status != hipSuccess) { \
+ std::cerr << "HIP error " << status \
+ << ": " << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ exit(status); \
+ } \
+ }
+
+ __global__ void simpleKernel(double *data)
+ {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ data[idx] = idx * 2.0;
+ }
+
+ int main()
+ {
+ double* deviceData0;
+ double* deviceData1;
+ size_t size = 1024 * sizeof(*deviceData0);
+
+ int deviceId0 = 0;
+ int deviceId1 = 1;
+
+ // Enable peer access to the memory (allocated and future) on the peer device.
+ // Ensure the device is active before enabling peer access.
+ hipSetDevice(deviceId0);
+ hipDeviceEnablePeerAccess(deviceId1, 0);
+
+ hipSetDevice(deviceId1);
+ hipDeviceEnablePeerAccess(deviceId0, 0);
+
+ // Set device 0 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+ HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+ simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Set device 1 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+ HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+ simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Use peer-to-peer access
+ hipSetDevice(deviceId0);
+
+ // Now device 0 can access memory allocated on device 1
+ hipMemcpy(deviceData0, deviceData1, size, hipMemcpyDeviceToDevice);
+
+ // Copy result from device 0
+ double hostData0[1024];
+ HIP_CHECK(hipSetDevice(deviceId0));
+ HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+ // Copy result from device 1
+ double hostData1[1024];
+ HIP_CHECK(hipSetDevice(deviceId1));
+ HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+ // Display results from both devices
+ std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+ std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+ // Free device memory
+ HIP_CHECK(hipFree(deviceData0));
+ HIP_CHECK(hipFree(deviceData1));
+
+ return 0;
+ }
+
+ .. tab-item:: without peer-to-peer
+
+ .. code-block:: cpp
+ :emphasize-lines: 43-49, 53, 58
+
+ #include
+ #include
+
+ #define HIP_CHECK(expression) \
+ { \
+ const hipError_t status = expression; \
+ if (status != hipSuccess) { \
+ std::cerr << "HIP error " << status \
+ << ": " << hipGetErrorString(status) \
+ << " at " << __FILE__ << ":" \
+ << __LINE__ << std::endl; \
+ exit(status); \
+ } \
+ }
+
+ __global__ void simpleKernel(double *data)
+ {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ data[idx] = idx * 2.0;
+ }
+
+ int main()
+ {
+ double* deviceData0;
+ double* deviceData1;
+ size_t size = 1024 * sizeof(*deviceData0);
+
+ int deviceId0 = 0;
+ int deviceId1 = 1;
+
+ // Set device 0 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+ HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+ simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Set device 1 and perform operations
+ HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+ HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+ simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
+ HIP_CHECK(hipDeviceSynchronize());
+
+ // Attempt to use deviceData0 on device 1 (This will not work as deviceData0 is allocated on device 0)
+ HIP_CHECK(hipSetDevice(deviceId1));
+ hipError_t err = hipMemcpy(deviceData1, deviceData0, size, hipMemcpyDeviceToDevice); // This should fail
+ if (err != hipSuccess)
+ {
+ std::cout << "Error: Cannot access deviceData0 from device 1, deviceData0 is on device 0" << std::endl;
+ }
+
+ // Copy result from device 0
+ double hostData0[1024];
+ HIP_CHECK(hipSetDevice(deviceId0));
+ HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+ // Copy result from device 1
+ double hostData1[1024];
+ HIP_CHECK(hipSetDevice(deviceId1));
+ HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+ // Display results from both devices
+ std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+ std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+ // Free device memory
+ HIP_CHECK(hipFree(deviceData0));
+ HIP_CHECK(hipFree(deviceData1));
+
+ return 0;
+ }
\ No newline at end of file
diff --git a/docs/how-to/hip_runtime_api/opengl_interop.rst b/docs/how-to/hip_runtime_api/opengl_interop.rst
new file mode 100644
index 0000000000..45f34de257
--- /dev/null
+++ b/docs/how-to/hip_runtime_api/opengl_interop.rst
@@ -0,0 +1,94 @@
+.. meta::
+ :description: HIP provides an OpenGL interoperability API that allows
+ efficient data sharing between HIP's computing power and
+ OpenGL's graphics rendering.
+ :keywords: AMD, ROCm, HIP, OpenGL, interop, interoperability
+
+*******************************************************************************
+OpenGL interoperability
+*******************************************************************************
+
+The HIP--OpenGL interoperation involves mapping OpenGL resources, such as
+buffers and textures, for HIP to interact with OpenGL. This mapping process
+enables HIP to utilize these resources directly, bypassing the need for costly
+data transfers between the CPU and GPU. This capability is useful in
+applications that require both intensive GPU computation and real-time
+visualization.
+
+The graphics resources must first be registered using functions like
+:cpp:func:`hipGraphicsGLRegisterBuffer` or :cpp:func:`hipGraphicsGLRegisterImage`.
+They can then be mapped to HIP with the :cpp:func:`hipGraphicsMapResources`
+function.
+
+After mapping, the :cpp:func:`hipGraphicsResourceGetMappedPointer` or
+:cpp:func:`hipGraphicsSubResourceGetMappedArray` functions can be used to
+retrieve a device pointer to the mapped resource, which can then be used in HIP
+kernels.
+
+Unmapping resources with :cpp:func:`hipGraphicsUnmapResources` after
+computations ensures proper resource management.
+
+Example
+===============================================================================
+
+ROCm examples have a `HIP--OpenGL interoperation example `_,
+where a simple HIP kernel is used to simulate a sine wave and rendered to a
+window as a grid of triangles using OpenGL. For a working example, there are
+multiple initialization steps needed like creating and opening a window,
+initializing OpenGL or selecting the OpenGL-capable device. After the
+initialization in the example, the kernel simulates the sinewave and updates
+the window's framebuffer in a cycle until the window is closed.
+
+.. note::
+
+ The more recent OpenGL functions are loaded with `OpenGL loader `_,
+ as these are not loaded by default on all platforms. The use of a custom
+ loader is shown in the following example:
+
+ ..
+
+ .. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+ :start-after: // [Sphinx opengl functions load start]
+ :end-before: // [Sphinx opengl functions load end]
+ :language: cpp
+
+ ..
+
+The OpenGL buffer is imported to HIP in the following way:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+ :start-after: // [Sphinx buffer register and get start]
+ :end-before: // [Sphinx buffer register and get end]
+ :language: cpp
+
+..
+
+The imported pointer is manipulated in the sinewave kernel as shown in the
+following example:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+ :start-after: /// [Sphinx sinewave kernel start]
+ :end-before: /// [Sphinx sinewave kernel end]
+ :language: cpp
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+ :start-after: // [Sphinx buffer use in kernel start]
+ :end-before: // [Sphinx buffer use in kernel end]
+ :language: cpp
+
+..
+
+When the HIP graphics resource imported from the OpenGL buffer is no longer
+needed, it should be unmapped and unregistered as shown in the following example:
+
+..
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+ :start-after: // [Sphinx unregister start]
+ :end-before: // [Sphinx unregister end]
+ :language: cpp
+
+..
diff --git a/docs/how-to/logging.rst b/docs/how-to/logging.rst
index 4a97332f1e..ecf40fa192 100644
--- a/docs/how-to/logging.rst
+++ b/docs/how-to/logging.rst
@@ -33,11 +33,12 @@ The value of this variable controls your logging level. Levels are defined as fo
.. code-block:: cpp
enum LogLevel {
- LOG_NONE = 0,
- LOG_ERROR = 1,
- LOG_WARNING = 2,
- LOG_INFO = 3,
- LOG_DEBUG = 4
+ LOG_NONE = 0,
+ LOG_ERROR = 1,
+ LOG_WARNING = 2,
+ LOG_INFO = 3,
+ LOG_DEBUG = 4,
+ LOG_EXTRA_DEBUG = 5
};
.. tip::
@@ -55,26 +56,27 @@ change this to any of the valid values:
.. code-block:: cpp
enum LogMask {
- LOG_API = 0x00000001, //!< API call
- LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
- LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
- LOG_AQL = 0x00000008, //!< Decode and display AQL packets
- LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
- LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
- LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
- LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
- LOG_COPY = 0x00000100, //!< Copy debug
- LOG_COPY2 = 0x00000200, //!< Detailed copy debug
- LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
- LOG_INIT = 0x00000800, //!< Initialization and shutdown
- LOG_MISC = 0x00001000, //!< misc debug, not yet classified
- LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
- LOG_CODE = 0x00004000, //!< Show code creation debug
- LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
- LOG_LOCATION = 0x00010000, //!< Log message location
- LOG_MEM = 0x00020000, //!< Memory allocation
- LOG_MEM_POOL = 0x00040000, //!< Memory pool allocation, including memory in graphs
- LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero
+ LOG_API = 1, //!< (0x1) API call
+ LOG_CMD = 2, //!< (0x2) Kernel and Copy Commands and Barriers
+ LOG_WAIT = 4, //!< (0x4) Synchronization and waiting for commands to finish
+ LOG_AQL = 8, //!< (0x8) Decode and display AQL packets
+ LOG_QUEUE = 16, //!< (0x10) Queue commands and queue contents
+ LOG_SIG = 32, //!< (0x20) Signal creation, allocation, pool
+ LOG_LOCK = 64, //!< (0x40) Locks and thread-safety code.
+ LOG_KERN = 128, //!< (0x80) Kernel creations and arguments, etc.
+ LOG_COPY = 256, //!< (0x100) Copy debug
+ LOG_COPY2 = 512, //!< (0x200) Detailed copy debug
+ LOG_RESOURCE = 1024, //!< (0x400) Resource allocation, performance-impacting events.
+ LOG_INIT = 2048, //!< (0x800) Initialization and shutdown
+ LOG_MISC = 4096, //!< (0x1000) Misc debug, not yet classified
+ LOG_AQL2 = 8192, //!< (0x2000) Show raw bytes of AQL packet
+ LOG_CODE = 16384, //!< (0x4000) Show code creation debug
+ LOG_CMD2 = 32768, //!< (0x8000) More detailed command info, including barrier commands
+ LOG_LOCATION = 65536, //!< (0x10000) Log message location
+ LOG_MEM = 131072, //!< (0x20000) Memory allocation
+ LOG_MEM_POOL = 262144, //!< (0x40000) Memory pool allocation, including memory in graphs
+ LOG_TS = 524288, //!< (0x80000) Timestamp details
+ LOG_ALWAYS = -1 //!< (0xFFFFFFFF) Log always even mask flag is zero
};
You can also define the logging mask via the ``AMD_LOG_MASK`` environment variable.
diff --git a/docs/how-to/performance_guidelines.rst b/docs/how-to/performance_guidelines.rst
index 9ebd210106..bf74b63d16 100644
--- a/docs/how-to/performance_guidelines.rst
+++ b/docs/how-to/performance_guidelines.rst
@@ -41,7 +41,7 @@ the host or parallel to the devices.
For parallel workloads, when threads belonging to the same block need to
synchronize to share data, use :cpp:func:`__syncthreads()` (see:
-:ref:`synchronization functions`) within the same kernel invocation. For threads
+:ref:`synchronization_functions`) within the same kernel invocation. For threads
belonging to different blocks, use global memory with two separate
kernel invocations. It is recommended to avoid the latter approach as it adds
overhead.
diff --git a/docs/how-to/programming_manual.md b/docs/how-to/programming_manual.md
deleted file mode 100644
index bac20c9996..0000000000
--- a/docs/how-to/programming_manual.md
+++ /dev/null
@@ -1,212 +0,0 @@
-# HIP programming manual
-
-## Host Memory
-
-### Introduction
-
-`hipHostMalloc` allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`.
-There are two use cases for this host memory:
-
-* Faster `HostToDevice` and `DeviceToHost` Data Transfers:
-The runtime tracks the `hipHostMalloc` allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool.
-* Zero-Copy GPU Access:
-GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired.
-
-### Memory allocation flags
-
-There are flags parameter which can specify options how to allocate the memory, for example,
-`hipHostMallocPortable`, the memory is considered allocated by all contexts, not just the one on which the allocation is made.
-`hipHostMallocMapped`, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API `hipHostGetDevicePointer()`.
-`hipHostMallocNumaUser` is the flag to allow host memory allocation to follow Numa policy by user. Please note this flag is currently only applicable on Linux, under development on Windows.
-
-All allocation flags are independent, and can be used in any combination without restriction, for instance, `hipHostMalloc` can be called with both `hipHostMallocPortable` and `hipHostMallocMapped` flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory.
-
-### Numa-aware host memory allocation
-
-Numa policy determines how memory is allocated.
-Target of Numa policy is to select a CPU that is closest to each GPU.
-Numa distance is the measurement of how far between GPU and CPU devices.
-
-By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using `hipSetDevice` API to a different GPU will still be able to access the host allocation, but can have longer Numa distance.
-Note, Numa policy is so far implemented on Linux, and under development on Windows.
-
-### Coherency Controls
-
-ROCm defines two coherency options for host memory:
-
-* Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include `threadfence_system` and C++11-style atomic operations.
-In order to achieve this fine-grained coherence, many AMD GPUs use a limited cache policy, such as leaving these allocations uncached by the GPU, or making them read-only.
-
-* Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running. Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries. This memory is appropriate for high-performance access when fine-grain synchronization is not required.
-
-HIP provides the developer with controls to select which type of memory is used via allocation flags passed to `hipHostMalloc` and the `HIP_HOST_COHERENT` environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP.
-The control logic in the current version of HIP is as follows:
-
-* No flags are passed in: the host memory allocation is coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocCoherent=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocMapped=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocNonCoherent=1`, `hipHostMallocCoherent=0`, and `hipHostMallocMapped=0`: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocCoherent=0`, `hipHostMallocNonCoherent=0`, `hipHostMallocMapped=0`, but one of the other `HostMalloc` flags is set:
- * If `HIP_HOST_COHERENT` is defined as 1, the host memory allocation is coherent.
- * If `HIP_HOST_COHERENT` is not defined, or defined as 0, the host memory allocation is non-coherent.
-* `hipHostMallocCoherent=1`, `hipHostMallocNonCoherent=1`: Illegal.
-
-### Visibility of Zero-Copy Host Memory
-
-Coherent host memory is automatically visible at synchronization points.
-Non-coherent
-
-| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility|
-| --- | --- | --- | --- | --- |
-| `hipStreamSynchronize` | host waits for all commands in the specified stream to complete | system-scope release | yes | yes |
-| `hipDeviceSynchronize` | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes |
-| `hipEventSynchronize` | host waits for the specified event to complete | device-scope release | yes | depends - see below|
-| `hipStreamWaitEvent` | stream waits for the specified event to complete | none | yes | no |
-
-### `hipEventSynchronize`
-
-Developers can control the release scope for `hipEvents`:
-
-* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device.
-
-A stronger system-level fence can be specified when the event is created with `hipEventCreateWithFlags`:
-
-* `hipEventReleaseToSystem`: Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use `hipEventReleaseToSystem`.
-* `hipEventDisableTiming`: Events created with this flag will not record profiling data and provide the best performance if used for synchronization.
-
-### Summary and Recommendations
-
-* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as `threadfence_system` to work transparently.
-* HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
-
-### Managed memory allocation
-
-Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation, on Linux, not on Windows (under development).
-
-Managed memory, via unified memory allocation, allows data be shared and accessible to both the CPU and GPU using a single pointer.
-The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism, the user can call managed memory API `hipMallocManaged` to allocate a large chunk of HMM memory, execute kernels on device and fetch data between the host and device as needed.
-
-In HIP application, it is recommended to do the capability check before calling the managed memory APIs. For example:
-
-```cpp
-int managed_memory = 0;
-HIPCHECK(hipDeviceGetAttribute(&managed_memory,
- hipDeviceAttributeManagedMemory,p_gpuDevice));
-
-if (!managed_memory ) {
- printf ("info: managed memory access not supported on the device %d\n Skipped\n", p_gpuDevice);
-}
-else {
- HIPCHECK(hipSetDevice(p_gpuDevice));
- HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
-. . .
-}
-```
-
-Please note, the managed memory capability check may not be necessary, but if HMM is not supported, then managed malloc will fall back to using system memory and other managed memory API calls will have undefined behavior.
-
-Note, managed memory management is implemented on Linux, not supported on Windows yet.
-
-### HIP Stream Memory Operations
-
-HIP supports Stream Memory Operations to enable direct synchronization between Network Nodes and GPU. Following new APIs are added,
- `hipStreamWaitValue32`
- `hipStreamWaitValue64`
- `hipStreamWriteValue32`
- `hipStreamWriteValue64`
-
-Note, CPU access to the semaphore's memory requires volatile keyword to disable CPU compiler's optimizations on memory access.
-For more details, please check the documentation `HIP-API.pdf`.
-
-Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using `hipStreamSynchronize(nullptr)` for synchronization.
-
-## Direct Dispatch
-
-HIP runtime has Direct Dispatch enabled by default in ROCM 4.4 on Linux.
-With this feature we move away from our conventional producer-consumer model where the runtime creates a worker thread(consumer) for each HIP Stream, and the host thread(producer) enqueues commands to a command queue(per stream).
-
-For Direct Dispatch, HIP runtime would directly enqueue a packet to the AQL queue (user mode queue on GPU) on the Dispatch API call from the application. That has shown to reduce the latency to launch the first wave on the idle GPU and total time of tiny dispatches synchronized with the host.
-
-In addition, eliminating the threads in runtime has reduced the variance in the dispatch numbers as the thread scheduling delays and atomics/locks synchronization latencies are reduced.
-
-This feature can be disabled via setting the following environment variable,
-AMD_DIRECT_DISPATCH=0
-
-Note, Direct Dispatch is implemented on Linux. It is currently not supported on Windows.
-
-## HIP Runtime Compilation
-
-HIP now supports runtime compilation (HIP RTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation.
-
-HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes.
-
-For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index).
-
-For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available.
-
-## HIP Graph
-
-HIP graphs are supported. For more details, refer to the [HIP API Guide](../doxygen/html/group___graph) or the [how-to section for HIP graphs](../how-to/hipgraph).
-
-## Device-Side Malloc
-
-HIP-Clang now supports device-side malloc and free.
-This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallocHeapSize,value)` nor respects any setting. The heap is fully dynamic and can grow until the available free memory on the device is consumed.
-
-## Use of Per-thread default stream
-
-The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that the command issued to the per-thread default stream by the thread does not implicitly synchronize with other streams (like explicitly created streams), or default per-thread stream on other threads.
-The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
-The per-thread default stream can be enabled via adding a compilation option,
-`-fgpu-default-stream=per-thread`.
-
-And users can explicitly use `hipStreamPerThread` as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread).
-
-## Use of Long Double Type
-
-In HIP-Clang, long double type is 80-bit extended precision format for x86_64, which is not supported by AMDGPU. HIP-Clang treats long double type as IEEE double type for AMDGPU. Using long double type in HIP source code will not cause issue as long as data of long double type is not transferred between host and device. However, long double type should not be used as kernel argument type.
-
-## Use of `_Float16` Type
-
-If a host function is to be used between clang (or hipcc) and gcc for x86_64, i.e. its definition is compiled by one compiler but the caller is compiled by a different compiler, `_Float16` or aggregates containing `_Float16` should not be used as function argument or return type. This is due to lack of stable ABI for `_Float16` on x86_64. Passing `_Float16` or aggregates containing `_Float16` between clang and gcc could cause undefined behavior.
-
-## FMA and contractions
-
-By default HIP-Clang assumes `-ffp-contract=fast-honor-pragmas`.
-Users can use `#pragma clang fp contract(on|off|fast)` to control `fp` contraction of a block of code.
-For x86_64, FMA is off by default since the generic x86_64 target does not
-support FMA by default. To turn on FMA on x86_64, either use `-mfma` or `-march=native`
-on CPU's supporting FMA.
-
-When contractions are enabled and the CPU has not enabled FMA instructions, the
-GPU can produce different numerical results than the CPU for expressions that
-can be contracted. Tolerance should be used for floating point comparisons.
-
-## Math functions with special rounding modes
-
-Note: Currently, HIP only supports basic math functions with rounding modern (round to nearest). HIP does not support basic math functions with rounding modes `ru` (round up), `rd` (round down), and `rz` (round towards zero).
-
-## Creating Static Libraries
-
-HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. The second type exports device functions to be linked by other code objects. However, this requires using hipcc as the linker.
-
-In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using `ar`.
-
-Here is an example to create and use static libraries:
-
-* Type 1 using `--emit-static-lib`:
-
- ```cpp
- hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
- gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
- ```
-
-* Type 2 using system `ar`:
-
- ```cpp
- hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
- ar rcsD libHipDevice.a hipDevice.o
- hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
- ```
-
-For more information, please see [HIP samples host functions](https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions) and [device_functions](https://github.com/ROCm/hip-tests/tree/rocm-5.5.x/samples/2_Cookbook/15_static_library/device_functions).
diff --git a/docs/how-to/unified_memory.rst b/docs/how-to/unified_memory.rst
deleted file mode 100644
index f64189454c..0000000000
--- a/docs/how-to/unified_memory.rst
+++ /dev/null
@@ -1,577 +0,0 @@
-.. meta::
- :description: This chapter describes introduces Unified Memory (UM) and shows
- how to use it in AMD HIP.
- :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU
-
-*******************************************************************************
-Unified memory
-*******************************************************************************
-
-In conventional architectures, CPUs and GPUs have dedicated memory like Random
-Access Memory (RAM) and Video Random Access Memory (VRAM). This architectural
-design, while effective, can be limiting in terms of memory capacity and
-bandwidth, as continuous memory copying is required to allow the processors to
-access the appropriate data. New architectural features like Heterogeneous
-System Architectures (HSA) and Unified Memory (UM) help avoid these limitations
-and promise increased efficiency and innovation.
-
-Unified memory
-==============
-Unified Memory is a single memory address space accessible from any processor
-within a system. This setup simplifies memory management processes and enables
-applications to allocate data that can be read or written by code running on
-either CPUs or GPUs. The Unified memory model is shown in the following figure.
-
-.. figure:: ../data/unified_memory/um.svg
-
-AMD Accelerated Processing Unit (APU) is a typical example of a Unified Memory
-Architecture. On a single die, a central processing unit (CPU) is combined
-with an integrated graphics processing unit (iGPU), and both have access to a
-high-bandwidth memory (HBM) module named Unified Memory. The CPU enables
-high-performance, low-latency operations, while the GPU is optimized for high
-throughput (data processed by unit time).
-
-.. _unified memory system requirements:
-
-System requirements
-===================
-Unified memory is supported on Linux by all modern AMD GPUs from the Vega
-series onward. Unified memory management can be achieved with managed memory
-allocation and, for the latest GPUs, with a system allocator.
-
-The table below lists the supported allocators. The allocators are described in
-the next section.
-
-.. list-table:: Supported Unified Memory Allocators
- :widths: 40, 25, 25, 25
- :header-rows: 1
- :align: center
-
- * - Architecture
- - ``hipMallocManaged()``
- - ``__managed__``
- - ``malloc()``
- * - MI200, MI300 Series
- - ✅
- - ✅
- - ✅ :sup:`1`
- * - MI100
- - ✅
- - ✅
- - ❌
- * - RDNA (Navi) Series
- - ✅
- - ✅
- - ❌
- * - GCN5 (Vega) Series
- - ✅
- - ✅
- - ❌
-
-✅: **Supported**
-
-❌: **Unsupported**
-
-:sup:`1` Works only with ``XNACK=1``. First GPU access causes recoverable
-page-fault. For more details, visit
-`GPU memory `_.
-
-.. _unified memory programming models:
-
-Unified memory programming models
-=================================
-
-Showcasing various unified memory programming models, the model availability
-depends on your architecture. For more information, see :ref:`unified memory
-system requirements` and :ref:`checking unified memory management support`.
-
-- **HIP managed memory allocation API**:
-
- The ``hipMallocManaged()`` is a dynamic memory allocator available on
- all GPUs with unified memory support. For more details, visit
- :ref:`unified_memory_reference`.
-
-- **HIP managed variables**:
-
- The ``__managed__`` declaration specifier, which serves as its counterpart,
- is supported on all modern AMD cards and can be utilized for static
- allocation.
-
-- **System allocation API**:
-
- Starting with the AMD MI300 series, the ``malloc()`` system allocator allows
- you to reserve unified memory. The system allocator is more versatile and
- offers an easy transition from a CPU written C++ code to a HIP code as the
- same system allocation API is used.
-
-.. _checking unified memory management support:
-
-Checking unified memory management support
-------------------------------------------
-Some device attributes can offer information about which :ref:`unified memory
-programming models` are supported. The attribute value is 1 if the
-functionality is supported, and 0 if it is not supported.
-
-.. list-table:: Device attributes for unified memory management
- :widths: 40, 60
- :header-rows: 1
- :align: center
-
- * - attribute
- - description
- * - ``hipDeviceAttributeManagedMemory``
- - unified addressing is supported
- * - ``hipDeviceAttributeConcurrentManagedAccess``
- - full managed memory support, concurrent access is supported
- * - ``hipDeviceAttributePageableMemoryAccess``
- - both managed and system memory allocation API is supported
-
-The following examples show how to use device attributes:
-
-.. code-block:: cpp
-
- #include
- #include
-
- int main() {
- int d;
- hipGetDevice(&d);
-
- int is_cma = 0;
- hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d);
- std::cout << "HIP Managed Memory: "
- << (is_cma == 1 ? "is" : "NOT")
- << " supported" << std::endl;
- return 0;
- }
-
-Example for unified memory management
--------------------------------------
-
-The following example shows how to use unified memory management with
-``hipMallocManaged()``, function, with ``__managed__`` attribute for static
-allocation and standard ``malloc()`` allocation. For comparison, the Explicit
-Memory Management example is presented in the last tab.
-
-.. tab-set::
-
- .. tab-item:: hipMallocManaged()
-
- .. code-block:: cpp
- :emphasize-lines: 12-15
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- int main() {
- int *a, *b, *c;
-
- // Allocate memory for a, b and c that is accessible to both device and host codes.
- hipMallocManaged(&a, sizeof(*a));
- hipMallocManaged(&b, sizeof(*b));
- hipMallocManaged(&c, sizeof(*c));
-
- // Setup input values.
- *a = 1;
- *b = 2;
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Prints the result.
- std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
- // Cleanup allocated memory.
- hipFree(a);
- hipFree(b);
- hipFree(c);
-
- return 0;
- }
-
-
- .. tab-item:: __managed__
-
- .. code-block:: cpp
- :emphasize-lines: 9-10
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- // Declare a, b and c as static variables.
- __managed__ int a, b, c;
-
- int main() {
- // Setup input values.
- a = 1;
- b = 2;
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Prints the result.
- std::cout << a << " + " << b << " = " << c << std::endl;
-
- return 0;
- }
-
-
- .. tab-item:: malloc()
-
- .. code-block:: cpp
- :emphasize-lines: 12-15
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int* a, int* b, int* c) {
- *c = *a + *b;
- }
-
- int main() {
- int* a, * b, * c;
-
- // Allocate memory for a, b, and c.
- a = (int*)malloc(sizeof(*a));
- b = (int*)malloc(sizeof(*b));
- c = (int*)malloc(sizeof(*c));
-
- // Setup input values.
- *a = 1;
- *b = 2;
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Prints the result.
- std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
- // Cleanup allocated memory.
- free(a);
- free(b);
- free(c);
-
- return 0;
- }
-
-
- .. tab-item:: Explicit Memory Management
-
- .. code-block:: cpp
- :emphasize-lines: 17-24, 29-30
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- int main() {
- int a, b, c;
- int *d_a, *d_b, *d_c;
-
- // Setup input values.
- a = 1;
- b = 2;
-
- // Allocate device copies of a, b and c.
- hipMalloc(&d_a, sizeof(*d_a));
- hipMalloc(&d_b, sizeof(*d_b));
- hipMalloc(&d_c, sizeof(*d_c));
-
- // Copy input values to device.
- hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice);
- hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice);
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
-
- // Copy the result back to the host.
- hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost);
-
- // Cleanup allocated memory.
- hipFree(d_a);
- hipFree(d_b);
- hipFree(d_c);
-
- // Prints the result.
- std::cout << a << " + " << b << " = " << c << std::endl;
-
- return 0;
- }
-
-.. _using unified memory management:
-
-Using unified memory management (UMM)
-=====================================
-
-Unified memory management (UMM) is a feature that can simplify the complexities
-of memory management in GPU computing. It is particularly useful in
-heterogeneous computing environments with heavy memory usage with both a CPU
-and a GPU, which would require large memory transfers. Here are some areas
-where UMM can be beneficial:
-
-- **Simplification of Memory Management**:
-
- UMM can help to simplify the complexities of memory management. This can make
- it easier for developers to write code without worrying about memory
- allocation and deallocation details.
-
-- **Data Migration**:
-
- UMM allows for efficient data migration between the host (CPU) and the device
- (GPU). This can be particularly useful for applications that need to move
- data back and forth between the device and host.
-
-- **Improved Programming Productivity**:
-
- As a positive side effect, UMM can reduce the lines of code, thereby
- improving programming productivity.
-
-In HIP, pinned memory allocations are coherent by default. Pinned memory is
-host memory mapped into the address space of all GPUs, meaning that the pointer
-can be used on both host and device. Using pinned memory instead of pageable
-memory on the host can improve bandwidth.
-
-While UMM can provide numerous benefits, it's important to be aware of the
-potential performance overhead associated with UMM. You must thoroughly test
-and profile your code to ensure it's the most suitable choice for your use
-case.
-
-.. _unified memory runtime hints:
-
-Unified memory HIP runtime hints for the better performance
-===========================================================
-
-Unified memory HIP runtime hints can help improve the performance of your code if
-you know your code's ability and infrastructure. Some hint techniques are
-presented in this section.
-
-The hint functions can set actions on a selected device, which can be
-identified by ``hipGetDeviceProperties(&prop, device_id)``. There are two
-special ``device_id`` values:
-
-- ``hipCpuDeviceId`` = -1 means that the advised device is the CPU.
-- ``hipInvalidDeviceId`` = -2 means that the device is invalid.
-
-For the best performance, profile your application to optimize the
-utilization of HIP runtime hints.
-
-Data prefetching
-----------------
-
-Data prefetching is a technique used to improve the performance of your
-application by moving data closer to the processing unit before it's actually
-needed.
-
-.. code-block:: cpp
- :emphasize-lines: 20-23,31-32
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- int main() {
- int *a, *b, *c;
- int deviceId;
- hipGetDevice(&deviceId); // Get the current device ID
-
- // Allocate memory for a, b and c that is accessible to both device and host codes.
- hipMallocManaged(&a, sizeof(*a));
- hipMallocManaged(&b, sizeof(*b));
- hipMallocManaged(&c, sizeof(*c));
-
- // Setup input values.
- *a = 1;
- *b = 2;
-
- // Prefetch the data to the GPU device.
- hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0);
- hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0);
- hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0);
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Prefetch the result back to the CPU.
- hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0);
-
- // Wait for the prefetch operations to complete.
- hipDeviceSynchronize();
-
- // Prints the result.
- std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
- // Cleanup allocated memory.
- hipFree(a);
- hipFree(b);
- hipFree(c);
-
- return 0;
- }
-
-Remember to check the return status of ``hipMemPrefetchAsync()`` to ensure that
-the prefetch operations are completed successfully.
-
-Memory advice
--------------
-
-The effectiveness of ``hipMemAdvise()`` comes from its ability to inform the
-runtime system of the developer's intentions regarding memory usage. When the
-runtime system has knowledge of the expected memory access patterns, it can
-make better decisions about data placement and caching, leading to more
-efficient execution of the application. However, the actual impact on
-performance can vary based on the specific use case and the hardware
-architecture.
-
-For the description of ``hipMemAdvise()`` and the detailed list of advice,
-visit the :ref:`unified_memory_reference`.
-
-Here is the updated version of the example above with memory advice.
-
-.. code-block:: cpp
- :emphasize-lines: 17-26
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- int main() {
- int *a, *b, *c;
-
- // Allocate memory for a, b, and c accessible to both device and host codes.
- hipMallocManaged(&a, sizeof(*a));
- hipMallocManaged(&b, sizeof(*b));
- hipMallocManaged(&c, sizeof(*c));
-
- // Set memory advice for a, b, and c to be accessed by the CPU.
- hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
- hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
- hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
-
- // Additionally, set memory advice for a, b, and c to be read mostly from the device 0.
- constexpr int device = 0;
- hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, device);
- hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, device);
- hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, device);
-
- // Setup input values.
- *a = 1;
- *b = 2;
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Prints the result.
- std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
- // Cleanup allocated memory.
- hipFree(a);
- hipFree(b);
- hipFree(c);
-
- return 0;
- }
-
-
-Memory range attributes
------------------------
-
-Memory Range attributes allow you to query attributes of a given memory range.
-
-The ``hipMemRangeGetAttribute()`` is added to the example to query the
-``hipMemRangeAttributeReadMostly`` attribute of the memory range pointed to by
-``a``. The result is stored in ``attributeValue`` and then printed out.
-
-For more details, visit the
-:ref:`unified_memory_reference`.
-
-.. code-block:: cpp
- :emphasize-lines: 29-34
-
- #include
- #include
-
- // Addition of two values.
- __global__ void add(int *a, int *b, int *c) {
- *c = *a + *b;
- }
-
- int main() {
- int *a, *b, *c;
- unsigned int attributeValue;
- constexpr size_t attributeSize = sizeof(attributeValue);
-
- // Allocate memory for a, b and c that is accessible to both device and host codes.
- hipMallocManaged(&a, sizeof(*a));
- hipMallocManaged(&b, sizeof(*b));
- hipMallocManaged(&c, sizeof(*c));
-
- // Setup input values.
- *a = 1;
- *b = 2;
-
- // Launch add() kernel on GPU.
- hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
- // Wait for GPU to finish before accessing on host.
- hipDeviceSynchronize();
-
- // Query an attribute of the memory range.
- hipMemRangeGetAttribute(&attributeValue,
- attributeSize,
- hipMemRangeAttributeReadMostly,
- a,
- sizeof(*a));
-
- // Prints the result.
- std::cout << *a << " + " << *b << " = " << *c << std::endl;
- std::cout << "The queried attribute value is: " << attributeValue << std::endl;
-
- // Cleanup allocated memory.
- hipFree(a);
- hipFree(b);
- hipFree(c);
-
- return 0;
- }
-
-Asynchronously attach memory to a stream
-----------------------------------------
-
-The ``hipStreamAttachMemAsync`` function would be able to asynchronously attach memory to a stream, which can help concurrent execution when using streams.
-
-Currently, this function is a no-operation (NOP) function on AMD GPUs. It simply returns success after the runtime memory validation passed. This function is necessary on Microsoft Windows, and UMM is not supported on this operating system with AMD GPUs at the moment.
diff --git a/docs/how-to/virtual_memory.rst b/docs/how-to/virtual_memory.rst
deleted file mode 100644
index 3e56bfb4fe..0000000000
--- a/docs/how-to/virtual_memory.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-.. meta::
- :description: This chapter describes introduces Virtual Memory (VM) and shows
- how to use it in AMD HIP.
- :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
-
-.. _virtual_memory:
-
-*****************************
-Virtual memory management
-*****************************
-
-Memory management is important when creating high-performance applications in the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, which can significantly impact performance.
-
-Global memory allocation in HIP uses the C language style allocation function. This works fine for simple cases but can cause problems if your memory needs change. If you need to increase the size of your memory, you must allocate a second larger buffer and copy the data to it before you can free the original buffer. This increases overall memory usage and causes unnecessary ``memcpy`` calls. Another solution is to allocate a larger buffer than you initially need. However, this isn't an efficient way to handle resources and doesn't solve the issue of reallocation when the extra buffer runs out.
-
-Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls.
-
-.. _memory_allocation_virtual_memory:
-
-Memory allocation
-=================
-
-Standard memory allocation uses the ``hipMalloc`` function to allocate a block of memory on the device. However, when using virtual memory, this process is separated into multiple steps using the ``hipMemCreate``, ``hipMemAddressReserve``, ``hipMemMap``, and ``hipMemSetAccess`` functions. This guide explains what these functions do and how you can use them for virtual memory management.
-
-Allocate physical memory
-------------------------
-
-The first step is to allocate the physical memory itself with the ``hipMemCreate`` function. This function accepts the size of the buffer, an ``unsigned long long`` variable for the flags, and a ``hipMemAllocationProp`` variable. ``hipMemAllocationProp`` contains the properties of the memory to be allocated, such as where the memory is physically located and what kind of shareable handles are available. If the allocation is successful, the function returns a value of ``hipSuccess``, with ``hipMemGenericAllocationHandle_t`` representing a valid physical memory allocation. The allocated memory size must be aligned with the granularity appropriate for the properties of the allocation. You can use the ``hipMemGetAllocationGranularity`` function to determine the correct granularity.
-
-.. code-block:: cpp
-
- size_t granularity = 0;
- hipMemGenericAllocationHandle_t allocHandle;
- hipMemAllocationProp prop = {};
- prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
- prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
- prop.location.id = currentDev;
- hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
- padded_size = ROUND_UP(size, granularity);
- hipMemCreate(&allocHandle, padded_size, &prop, 0);
-
-Reserve virtual address range
------------------------------
-
-After you have acquired an allocation of physical memory, you must map it before you can use it. To do so, you need a virtual address to map it to. Mapping means the physical memory allocation is available from the virtual address range it is mapped to. To reserve a virtual memory range, use the ``hipMemAddressReserve`` function. The size of the virtual memory must match the amount of physical memory previously allocated. You can then map the physical memory allocation to the newly-acquired virtual memory address range using the ``hipMemMap`` function.
-
-.. code-block:: cpp
-
- hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
- hipMemMap(ptr, padded_size, 0, allocHandle, 0);
-
-Set memory access
------------------
-
-Finally, use the ``hipMemSetAccess`` function to enable memory access. It accepts the pointer to the virtual memory, the size, and a ``hipMemAccessDesc`` descriptor as parameters. In a multi-GPU environment, you can map the device memory of one GPU to another. This feature also works with the traditional memory management system, but isn't as scalable as with virtual memory. When memory is allocated with ``hipMalloc``, ``hipDeviceEnablePeerAccess`` is used to enable peer access. This function enables access between two devices, but it means that every call to ``hipMalloc`` takes more time to perform the checks and the mapping between the devices. When using virtual memory management, peer access is enabled by ``hipMemSetAccess``, which provides a finer level of control over what is shared. This has no performance impact on memory allocation and gives you more control over what memory buffers are shared with which devices.
-
-.. code-block:: cpp
-
- hipMemAccessDesc accessDesc = {};
- accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
- accessDesc.location.id = currentDev;
- accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
- hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
-
-At this point the memory is allocated, mapped, and ready for use. You can read and write to it, just like you would a C style memory allocation.
-
-Free virtual memory
--------------------
-
-To free the memory allocated in this manner, use the corresponding free functions. To unmap the memory, use ``hipMemUnmap``. To release the virtual address range, use ``hipMemAddressFree``. Finally, to release the physical memory, use ``hipMemRelease``. A side effect of these functions is the lack of synchronization when memory is released. If you call ``hipFree`` when you have multiple streams running in parallel, it synchronizes the device. This causes worse resource usage and performance.
-
-.. code-block:: cpp
-
- hipMemUnmap(ptr, size);
- hipMemRelease(allocHandle);
- hipMemAddressFree(ptr, size);
-
-.. _usage_virtual_memory:
-
-Memory usage
-============
-
-Dynamically increase allocation size
-------------------------------------
-
-The ``hipMemAddressReserve`` function allows you to increase the amount of pre-allocated memory. This function accepts a parameter representing the requested starting address of the virtual memory. This allows you to have a continuous virtual address space without worrying about the underlying physical allocation.
-
-.. code-block:: cpp
-
- hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
- hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
- hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
-
-The code sample above assumes that ``hipMemAddressReserve`` was able to reserve the memory address at the specified location. However, this isn't guaranteed to be true, so you should validate that ``new_ptr`` points to a specific virtual address before using it.
diff --git a/docs/index.md b/docs/index.md
index e62da5fa73..7b3f3bc513 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,71 +1,54 @@
-# HIP documentation
-
-The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime
-API and kernel language that lets developers create portable applications for AMD
-and NVIDIA GPUs from single source code.
-
-For HIP supported AMD GPUs on multiple operating systems, see:
-
-* [Linux system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus)
-* [Microsoft Windows system requirements](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)
-
-The CUDA enabled NVIDIA GPUs are supported by HIP. For more information, see [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
-
-On the AMD ROCm platform, HIP provides header files and runtime library built on top of HIP-Clang compiler in the repository [Compute Language Runtimes (CLR)](./understand/amd_clr), which contains source codes for AMD's compute languages runtimes as follows,
+
+
+
+
+
-On non-AMD platforms, like NVIDIA, HIP provides header files required to support non-AMD specific back-end implementation in the repository ['hipother'](https://github.com/ROCm/hipother), which translates from the HIP runtime APIs to CUDA runtime APIs.
-
-## Overview
+# HIP documentation
-::::{grid} 1 1 2 2
-:gutter: 3
+The Heterogeneous-computing Interface for Portability (HIP) is a C++ runtime API
+and kernel language that lets you create portable applications for AMD and
+NVIDIA GPUs from a single source code. For more information, see [What is HIP?](./what_is_hip)
-:::{grid-item-card} Install
+Installation instructions are available from:
* [Installing HIP](./install/install)
* [Building HIP from source](./install/build)
-:::
+The HIP documentation is organized into the following categories:
+
+::::{grid} 1 2 2 2
+:gutter: 3
-:::{grid-item-card} Conceptual
+:::{grid-item-card} Programming guide
+* [Introduction](./programming_guide)
* {doc}`./understand/programming_model`
* {doc}`./understand/hardware_implementation`
-* {doc}`./understand/amd_clr`
-* {doc}`./understand/texture_fetching`
-
-:::
-
-:::{grid-item-card} How to
-
-* [Programming manual](./how-to/programming_manual)
-* [HIP porting guide](./how-to/hip_porting_guide)
-* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
-* {doc}`./how-to/hip_rtc`
+* {doc}`./understand/compilers`
* {doc}`./how-to/performance_guidelines`
* [Debugging with HIP](./how-to/debugging)
* {doc}`./how-to/logging`
-* [Unified memory](./how-to/unified_memory)
-* [Virtual memory](./how-to/virtual_memory)
-* {doc}`./how-to/stream_ordered_allocator`
-* [Cooperative groups](./how-to/cooperative_groups)
-* [HIP graphs](./how-to/hipgraph)
-* {doc}`./how-to/faq`
+* {doc}`./how-to/hip_runtime_api`
+* [HIP porting guide](./how-to/hip_porting_guide)
+* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
+* {doc}`./how-to/hip_rtc`
+* {doc}`./understand/amd_clr`
:::
:::{grid-item-card} Reference
* [HIP runtime API](./reference/hip_runtime_api_reference)
- * [Modules](./reference/hip_runtime_api/modules)
- * [Global defines, enums, structs and files](./reference/hip_runtime_api/global_defines_enums_structs_files)
* [HSA runtime API for ROCm](./reference/virtual_rocr)
* [C++ language extensions](./reference/cpp_language_extensions)
* [C++ language support](./reference/cpp_language_support)
* [HIP math API](./reference/math_api)
+* [HIP environment variables](./reference/env_variables)
* [Comparing syntax for different APIs](./reference/terms)
* [List of deprecated APIs](./reference/deprecated_api_list)
* [FP8 numbers in HIP](./reference/fp8_numbers)
+* {doc}`./reference/hardware_features`
:::
diff --git a/docs/install/build.rst b/docs/install/build.rst
index 013297962f..4f8f8bf505 100644
--- a/docs/install/build.rst
+++ b/docs/install/build.rst
@@ -1,3 +1,7 @@
+.. meta::
+ :description: This page gives instructions on how to build HIP from source.
+ :keywords: AMD, ROCm, HIP, build, build instructions, source
+
*******************************************
Build HIP from source
*******************************************
diff --git a/docs/install/install.rst b/docs/install/install.rst
index afacee1258..a5b174a1dd 100644
--- a/docs/install/install.rst
+++ b/docs/install/install.rst
@@ -1,12 +1,20 @@
+.. meta::
+ :description: This page explains how to install HIP
+ :keywords: AMD, ROCm, HIP, install, installation
+
*******************************************
Install HIP
*******************************************
HIP can be installed on AMD (ROCm with HIP-Clang) and NVIDIA (CUDA with NVCC) platforms.
-Note: The version definition for the HIP runtime is different from CUDA. On an AMD platform, the
-``hipRuntimeGerVersion`` function returns the HIP runtime version; on an NVIDIA platform, this function
-returns the CUDA runtime version.
+.. note::
+ The version definition for the HIP runtime is different from CUDA. On AMD
+ platforms, the :cpp:func:`hipRuntimeGetVersion` function returns the HIP
+ runtime version. On NVIDIA platforms, this function returns the CUDA runtime
+ version.
+
+.. _install_prerequisites:
Prerequisites
=======================================
@@ -24,8 +32,9 @@ Prerequisites
.. tab-item:: NVIDIA
:sync: nvidia
- Check the system requirements in the
- `NVIDIA CUDA Installation Guide `_.
+ With NVIDIA GPUs, HIP requires unified memory. All CUDA-enabled NVIDIA
+ GPUs with compute capability 5.0 or later should be supported. For more
+ information, see `NVIDIA's list of CUDA enabled GPUs <https://developer.nvidia.com/cuda-gpus>`_.
Installation
=======================================
diff --git a/docs/programming_guide.rst b/docs/programming_guide.rst
new file mode 100644
index 0000000000..4e63f53eaa
--- /dev/null
+++ b/docs/programming_guide.rst
@@ -0,0 +1,84 @@
+.. meta::
+ :description: HIP programming guide introduction
+ :keywords: HIP programming guide introduction, HIP programming guide
+
+.. _hip-programming-guide:
+
+********************************************************************************
+HIP programming guide introduction
+********************************************************************************
+
+This topic provides key HIP programming concepts and links to more detailed
+information.
+
+Write GPU Kernels for Parallel Execution
+================================================================================
+
+To make the most of the parallelism inherent to GPUs, a thorough understanding
+of the :ref:`programming model ` is helpful. The HIP
+programming model is designed to make it easy to map data-parallel algorithms to
+the architecture of the GPUs. HIP employs the SIMT-model (Single
+Instruction Multiple Threads) with a multi-layered thread hierarchy for
+efficient execution.
+
+Understand the Target Architecture (CPU and GPU)
+================================================================================
+
+The :ref:`hardware implementation ` topic outlines the
+GPUs supported by HIP. In general, GPUs are made up of Compute Units that excel
+at executing parallelizable, computationally intensive workloads without complex
+control-flow.
+
+Increase parallelism on multiple levels
+================================================================================
+
+To maximize performance and keep all system components fully utilized, the
+application should expose and efficiently manage as much parallelism as possible.
+:ref:`Parallel execution ` can be achieved at the
+application, device, and multiprocessor levels.
+
+The application’s host and device operations can achieve parallel execution
+through asynchronous calls, streams, or HIP graphs. On the device level,
+multiple kernels can execute concurrently when resources are available, and at
+the multiprocessor level, developers can overlap data transfers with
+computations to further optimize performance.
+
+Memory management
+================================================================================
+
+GPUs generally have their own distinct memory, also called :ref:`device
+memory `, separate from the :ref:`host memory `.
+Device memory needs to be managed separately from the host memory. This includes
+allocating the memory and transferring data between the host and the device. These
+operations can be performance critical, so it's important to know how to use
+them effectively. For more information, see :ref:`Memory management `.
+
+Synchronize CPU and GPU Workloads
+================================================================================
+
+Tasks on the host and devices run asynchronously, so proper synchronization is
+needed when dependencies between those tasks exist. The asynchronous execution
+of tasks is useful for fully utilizing the available resources. Even when only a
+single device is available, memory transfers and the execution of tasks can be
+overlapped with asynchronous execution.
+
+Error Handling
+================================================================================
+
+All functions in the HIP runtime API return an error value of type
+:cpp:enum:`hipError_t` that can be used to verify whether the function was
+successfully executed. It's important to confirm these
+returned values, in order to catch and handle those errors, if possible.
+An exception is kernel launches, which don't return any value. These
+errors can be caught with specific functions like :cpp:func:`hipGetLastError()`.
+
+For more information, see :ref:`error_handling`.
+
+Multi-GPU and Load Balancing
+================================================================================
+
+Large-scale applications that need more compute power can use multiple GPUs in
+the system. This requires distributing workloads across multiple GPUs to balance
+the load and prevent some GPUs from being overutilized while others are idle.
+
+For more information, see :ref:`multi-device`.
\ No newline at end of file
diff --git a/docs/reference/cpp_language_extensions.rst b/docs/reference/cpp_language_extensions.rst
index c0b804c552..243b6ae08e 100644
--- a/docs/reference/cpp_language_extensions.rst
+++ b/docs/reference/cpp_language_extensions.rst
@@ -293,6 +293,7 @@ dimensions to 1.
dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
};
+.. _memory_fence_instructions:
Memory fence instructions
====================================================
@@ -306,7 +307,7 @@ HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using
``hipHostMalloc()``.
* Remove ``memcpy`` for all allocated fine-grained system memory regions.
-.. _synchronization functions:
+.. _synchronization_functions:
Synchronization functions
====================================================
@@ -376,6 +377,8 @@ To read a high-resolution timer from the device, HIP provides the following buil
Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphic processors.
+.. _atomic functions:
+
Atomic functions
===============================================
@@ -734,6 +737,8 @@ will be enabled unconditionally in the next ROCm release. Wherever possible, the
implementation includes a static assert to check that the program source uses
the correct type for the mask.
+.. _warp_vote_functions:
+
Warp vote and ballot functions
-------------------------------------------------------------------------------------------------------------
diff --git a/docs/reference/deprecated_api_list.rst b/docs/reference/deprecated_api_list.rst
index f73297e41d..a21643478c 100644
--- a/docs/reference/deprecated_api_list.rst
+++ b/docs/reference/deprecated_api_list.rst
@@ -6,87 +6,171 @@
HIP deprecated runtime API functions
**********************************************************************************************
-Several of our API functions have been flagged for deprecation. Using the following functions results in
-errors and unexpected results, so we encourage you to update your code accordingly.
+Several of our API functions have been flagged for deprecation. Using the
+following functions results in errors and unexpected results, so we encourage
+you to update your code accordingly.
-Context management
+Deprecated since ROCm 6.1.0
============================================================
-CUDA supports cuCtx API, which is the driver API that defines "Context" and "Devices" as separate
-entities. Context contains a single device, and a device can theoretically have multiple contexts. HIP
-initially added limited support for these APIs in order to facilitate porting from existing driver codes.
-These APIs are now marked as deprecated because there are better alternate interfaces (such as
-``hipSetDevice`` or the stream API) to achieve these functions.
-
-* ``hipCtxCreate``
-* ``hipCtxDestroy``
-* ``hipCtxPopCurrent``
-* ``hipCtxPushCurrent``
-* ``hipCtxSetCurrent``
-* ``hipCtxGetCurrent``
-* ``hipCtxGetDevice``
-* ``hipCtxGetApiVersion``
-* ``hipCtxGetCacheConfig``
-* ``hipCtxSetCacheConfig``
-* ``hipCtxSetSharedMemConfig``
-* ``hipCtxGetSharedMemConfig``
-* ``hipCtxSynchronize``
-* ``hipCtxGetFlags``
-* ``hipCtxEnablePeerAccess``
-* ``hipCtxDisablePeerAccess``
-* ``hipDevicePrimaryCtxGetState``
-* ``hipDevicePrimaryCtxRelease``
-* ``hipDevicePrimaryCtxRetain``
-* ``hipDevicePrimaryCtxReset``
-* ``hipDevicePrimaryCtxSetFlags``
-
-Memory management
+Deprecated texture management functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipTexRefGetBorderColor`
+ * - :cpp:func:`hipTexRefGetArray`
+
+Deprecated since ROCm 5.7.0
+============================================================
+
+Deprecated texture management functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipBindTextureToMipmappedArray`
+
+Deprecated since ROCm 5.3.0
+============================================================
+
+Deprecated texture management functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipGetTextureReference`
+ * - :cpp:func:`hipTexRefSetAddressMode`
+ * - :cpp:func:`hipTexRefSetArray`
+ * - :cpp:func:`hipTexRefSetFlags`
+ * - :cpp:func:`hipTexRefSetFilterMode`
+ * - :cpp:func:`hipTexRefSetFormat`
+ * - :cpp:func:`hipTexRefSetMipmapFilterMode`
+ * - :cpp:func:`hipTexRefSetMipmapLevelBias`
+ * - :cpp:func:`hipTexRefSetMipmapLevelClamp`
+ * - :cpp:func:`hipTexRefSetMipmappedArray`
+
+Deprecated since ROCm 4.3.0
+============================================================
+
+Deprecated texture management functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipTexRefGetAddress`
+ * - :cpp:func:`hipTexRefGetAddressMode`
+ * - :cpp:func:`hipTexRefGetFilterMode`
+ * - :cpp:func:`hipTexRefGetFlags`
+ * - :cpp:func:`hipTexRefGetFormat`
+ * - :cpp:func:`hipTexRefGetMaxAnisotropy`
+ * - :cpp:func:`hipTexRefGetMipmapFilterMode`
+ * - :cpp:func:`hipTexRefGetMipmapLevelBias`
+ * - :cpp:func:`hipTexRefGetMipmapLevelClamp`
+ * - :cpp:func:`hipTexRefGetMipMappedArray`
+ * - :cpp:func:`hipTexRefSetAddress`
+ * - :cpp:func:`hipTexRefSetAddress2D`
+ * - :cpp:func:`hipTexRefSetBorderColor`
+ * - :cpp:func:`hipTexRefSetMaxAnisotropy`
+
+Deprecated since ROCm 3.8.0
+============================================================
+
+Deprecated memory management and texture management functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipBindTexture`
+ * - :cpp:func:`hipBindTexture2D`
+ * - :cpp:func:`hipBindTextureToArray`
+ * - :cpp:func:`hipGetTextureAlignmentOffset`
+ * - :cpp:func:`hipUnbindTexture`
+ * - :cpp:func:`hipMemcpyToArray`
+ * - :cpp:func:`hipMemcpyFromArray`
+
+Deprecated since ROCm 3.1.0
============================================================
-* ``hipMallocHost`` (replaced with ``hipHostMalloc``)
-* ``hipMemAllocHost`` (replaced with ``hipHostMalloc``)
-* ``hipMemcpyToArray``
-* ``hipMemcpyFromArray``
+Deprecated memory management functions.
+
+.. list-table::
+ :widths: 40, 60
+ :header-rows: 1
+ :align: left
+
+ * - function
+ -
+ * - :cpp:func:`hipMallocHost`
+ - replaced with :cpp:func:`hipHostAlloc`
+ * - :cpp:func:`hipMemAllocHost`
+ - replaced with :cpp:func:`hipHostAlloc`
-Profiler control
+Deprecated since ROCm 3.0.0
============================================================
-* ``hipProfilerStart`` (use roctracer/rocTX)
-* ``hipProfilerStop`` (use roctracer/rocTX)
+The ``hipProfilerStart`` and ``hipProfilerStop`` functions are deprecated.
+Instead, you can use ``roctracer`` or ``rocTX`` for profiling, which provide more
+flexibility and detailed profiling capabilities.
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
-Texture management
+ * - function
+ * - :cpp:func:`hipProfilerStart`
+ * - :cpp:func:`hipProfilerStop`
+
+Deprecated since ROCm 1.9.0
============================================================
-* ``hipGetTextureReference``
-* ``hipTexRefSetAddressMode``
-* ``hipTexRefSetArray``
-* ``hipTexRefSetFilterMode``
-* ``hipTexRefSetFlags``
-* ``hipTexRefSetFormat``
-* ``hipTexRefGetAddress``
-* ``hipTexRefGetAddressMode``
-* ``hipTexRefGetFilterMode``
-* ``hipTexRefGetFlags``
-* ``hipTexRefGetFormat``
-* ``hipTexRefGetMaxAnisotropy``
-* ``hipTexRefGetMipmapFilterMode``
-* ``hipTexRefGetMipmapLevelBias``
-* ``hipTexRefGetMipmapLevelClamp``
-* ``hipTexRefGetMipMappedArray``
-* ``hipTexRefSetAddress``
-* ``hipTexRefSetAddress2D``
-* ``hipTexRefSetMaxAnisotropy``
-* ``hipTexRefSetBorderColor``
-* ``hipTexRefSetMipmapFilterMode``
-* ``hipTexRefSetMipmapLevelBias``
-* ``hipTexRefSetMipmapLevelClamp``
-* ``hipTexRefSetMipmappedArray``
-* ``hipTexRefGetBorderColor``
-* ``hipTexRefGetArray``
-* ``hipBindTexture``
-* ``hipBindTexture2D``
-* ``hipBindTextureToArray``
-* ``hipGetTextureAlignmentOffset``
-* ``hipUnbindTexture``
-* ``hipBindTextureToMipmappedArray``
+CUDA supports cuCtx API, which is the driver API that defines "Context" and
+"Devices" as separate entities. Context contains a single device, and a device
+can theoretically have multiple contexts. HIP initially added limited support
+for context APIs in order to facilitate porting from existing driver codes. These
+APIs are now marked as deprecated because there are better alternate interfaces
+(such as ``hipSetDevice`` or the stream API) to achieve these functions.
+
+.. list-table::
+ :widths: 40
+ :header-rows: 1
+ :align: left
+
+ * - function
+ * - :cpp:func:`hipCtxCreate`
+ * - :cpp:func:`hipCtxDestroy`
+ * - :cpp:func:`hipCtxPopCurrent`
+ * - :cpp:func:`hipCtxPushCurrent`
+ * - :cpp:func:`hipCtxSetCurrent`
+ * - :cpp:func:`hipCtxGetCurrent`
+ * - :cpp:func:`hipCtxGetDevice`
+ * - :cpp:func:`hipCtxGetApiVersion`
+ * - :cpp:func:`hipCtxGetCacheConfig`
+ * - :cpp:func:`hipCtxSetCacheConfig`
+ * - :cpp:func:`hipCtxSetSharedMemConfig`
+ * - :cpp:func:`hipCtxGetSharedMemConfig`
+ * - :cpp:func:`hipCtxSynchronize`
+ * - :cpp:func:`hipCtxGetFlags`
+ * - :cpp:func:`hipCtxEnablePeerAccess`
+ * - :cpp:func:`hipCtxDisablePeerAccess`
+ * - :cpp:func:`hipDevicePrimaryCtxGetState`
+ * - :cpp:func:`hipDevicePrimaryCtxRelease`
+ * - :cpp:func:`hipDevicePrimaryCtxRetain`
+ * - :cpp:func:`hipDevicePrimaryCtxReset`
+ * - :cpp:func:`hipDevicePrimaryCtxSetFlags`
diff --git a/docs/reference/env_variables.rst b/docs/reference/env_variables.rst
new file mode 100644
index 0000000000..a2e504470c
--- /dev/null
+++ b/docs/reference/env_variables.rst
@@ -0,0 +1,189 @@
+.. meta::
+ :description: HIP environment variables reference
+ :keywords: AMD, HIP, environment variables, environment, reference
+
+********************************************************************************
+HIP environment variables
+********************************************************************************
+
+In this section, the reader can find all the important HIP environment variables
+on the AMD platform, which are grouped by functionality.
+
+GPU isolation variables
+================================================================================
+
+The GPU isolation environment variables in HIP are collected in the next table.
+For more information, check :doc:`GPU isolation page `.
+
+.. list-table::
+ :header-rows: 1
+ :widths: 70,30
+
+ * - **Environment variable**
+ - **Value**
+
+ * - | ``ROCR_VISIBLE_DEVICES``
+ | A list of device indices or UUIDs that will be exposed to applications.
+ - Example: ``0,GPU-DEADBEEFDEADBEEF``
+
+ * - | ``GPU_DEVICE_ORDINAL``
+ | Device indices exposed to OpenCL and HIP applications.
+ - Example: ``0,2``
+
+ * - | ``HIP_VISIBLE_DEVICES`` or ``CUDA_VISIBLE_DEVICES``
+ | Device indices exposed to HIP applications.
+ - Example: ``0,2``
+
+Profiling variables
+================================================================================
+
+The profiling environment variables in HIP are collected in the next table. For
+more information, check :doc:`setting the number of CUs page `.
+
+.. list-table::
+ :header-rows: 1
+ :widths: 70,30
+
+ * - **Environment variable**
+ - **Value**
+
+ * - | ``HSA_CU_MASK``
+ | Sets the mask on a lower level of queue creation in the driver,
+ | this mask will also be set for queues being profiled.
+ - Example: ``1:0-8``
+
+ * - | ``ROC_GLOBAL_CU_MASK``
+ | Sets the mask on queues created by the HIP or the OpenCL runtimes,
+ | this mask will also be set for queues being profiled.
+ - Example: ``0xf``, enables only 4 CUs
+
+ * - | ``HIP_FORCE_QUEUE_PROFILING``
+ | Used to run the app as if it were run in rocprof. Forces command queue
+ | profiling on by default.
+ - | 0: Disable
+ | 1: Enable
+
+Debug variables
+================================================================================
+
+The debugging environment variables in HIP are collected in the next table. For
+more information, check :ref:`debugging_with_hip`.
+
+.. include:: ../how-to/debugging_env.rst
+
+Memory management related variables
+================================================================================
+
+The memory management related environment variables in HIP are collected in the
+next table.
+
+.. list-table::
+ :header-rows: 1
+ :widths: 35,14,51
+
+ * - **Environment variable**
+ - **Default value**
+ - **Value**
+
+ * - | ``HIP_HIDDEN_FREE_MEM``
+ | Amount of memory to hide from the free memory reported by hipMemGetInfo.
+ - ``0``
+ - | 0: Disable
+ | Unit: megabyte (MB)
+
+ * - | ``HIP_HOST_COHERENT``
+ | Specifies if the memory is coherent between the host and GPU in ``hipHostMalloc``.
+ - ``0``
+ - | 0: Memory is not coherent.
+ | 1: Memory is coherent.
+ | The environment variable takes effect only if the following conditions are satisfied:
+ | - One of the ``hipHostMallocDefault``, ``hipHostMallocPortable``, ``hipHostMallocWriteCombined`` or ``hipHostMallocNumaUser`` flag set to 1.
+ | - ``hipHostMallocCoherent``, ``hipHostMallocNonCoherent`` and ``hipHostMallocMapped`` flags set to 0.
+
+ * - | ``HIP_INITIAL_DM_SIZE``
+ | Set initial heap size for device malloc.
+ - ``8388608``
+ - | Unit: Byte
+ | The default value corresponds to 8 MB.
+
+ * - | ``HIP_MEM_POOL_SUPPORT``
+ | Enables memory pool support in HIP.
+ - ``0``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``HIP_MEM_POOL_USE_VM``
+ | Enables memory pool support in HIP.
+ - | ``0``: other OS
+ | ``1``: Windows
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``HIP_VMEM_MANAGE_SUPPORT``
+ | Virtual Memory Management Support.
+ - ``1``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``GPU_MAX_HEAP_SIZE``
+ | Set maximum size of the GPU heap to % of board memory.
+ - ``100``
+ - | Unit: Percentage
+
+ * - | ``GPU_MAX_REMOTE_MEM_SIZE``
+ | Maximum size that allows device memory substitution with system.
+ - ``2``
+ - | Unit: kilobyte (KB)
+
+ * - | ``GPU_NUM_MEM_DEPENDENCY``
+ | Number of memory objects for dependency tracking.
+ - ``256``
+ -
+
+ * - | ``GPU_STREAMOPS_CP_WAIT``
+ | Force the stream memory operation to wait on CP.
+ - ``0``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``HSA_LOCAL_MEMORY_ENABLE``
+ | Enable HSA device local memory usage.
+ - ``1``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``PAL_ALWAYS_RESIDENT``
+ | Force memory resources to become resident at allocation time.
+ - ``0``
+ - | 0: Disable
+ | 1: Enable
+
+ * - | ``PAL_PREPINNED_MEMORY_SIZE``
+ | Size of prepinned memory.
+ - ``64``
+ - | Unit: kilobyte (KB)
+
+ * - | ``REMOTE_ALLOC``
+ | Use remote memory for the global heap allocation.
+ - ``0``
+ - | 0: Disable
+ | 1: Enable
+
+Other useful variables
+================================================================================
+
+The following table lists environment variables that are useful but relate to
+different features.
+
+.. list-table::
+ :header-rows: 1
+ :widths: 35,14,51
+
+ * - **Environment variable**
+ - **Default value**
+ - **Value**
+
+ * - | ``HIPRTC_COMPILE_OPTIONS_APPEND``
+ | Sets compile options needed for ``hiprtc`` compilation.
+ - None
+ - ``--gpu-architecture=gfx906:sramecc+:xnack``, ``-fgpu-rdc``
diff --git a/docs/reference/hardware_features.rst b/docs/reference/hardware_features.rst
new file mode 100644
index 0000000000..7ec9ec329e
--- /dev/null
+++ b/docs/reference/hardware_features.rst
@@ -0,0 +1,249 @@
+.. meta::
+ :description: This chapter describes the hardware features of the different hardware architectures.
+ :keywords: AMD, ROCm, HIP, hardware, hardware features, hardware architectures
+
+*******************************************************************************
+Hardware features
+*******************************************************************************
+
+This page gives an overview of the different hardware architectures and the
+features they implement. Hardware features do not imply performance; that
+depends on the specifications found in the :doc:`rocm:reference/gpu-arch-specs`
+page.
+
+ .. list-table::
+ :header-rows: 1
+ :name: hardware-features-table
+
+ *
+ - Hardware feature support
+ - RDNA1
+ - CDNA1
+ - RDNA2
+ - CDNA2
+ - RDNA3
+ - CDNA3
+ *
+ - :ref:`atomic functions` on 32-bit integer values in global and shared memory
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Atomic functions on 64-bit integer values in global and shared memory
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Atomic addition on 32-bit floating point values in global and shared memory
+ - ❌
+ - ❌
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Atomic addition on 64-bit floating point values in global memory and shared memory
+ - ❌
+ - ❌
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`Warp vote functions `
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`Memory fence instructions `
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`Synchronization functions `
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`Surface functions `
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`float16 half precision IEEE-conformant floating-point operations`
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - :ref:`bfloat16 16-bit floating-point operations`
+ - ❌
+ - ✅
+ - ❌
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Support for :ref:`8-bit floating-point types `
+ - ❌
+ - ❌
+ - ❌
+ - ❌
+ - ❌
+ - ✅
+ *
+ - Support for :ref:`tensor float32 `
+ - ❌
+ - ❌
+ - ❌
+ - ❌
+ - ❌
+ - ✅
+ *
+ - Packed math with 16-bit floating point values
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Packed math with 32-bit floating point values
+ - ❌
+ - ❌
+ - ❌
+ - ✅
+ - ❌
+ - ✅
+ *
+ - Matrix Cores
+ - ❌
+ - ✅
+ - ❌
+ - ✅
+ - ❌
+ - ✅
+ *
+ - On-Chip Error Correcting Code (ECC)
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ - ✅
+ *
+ - Maximum dimensionality of grid
+ - 3
+ - 3
+ - 3
+ - 3
+ - 3
+ - 3
+ *
+ - Maximum x-, y- or z-dimension of a grid
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ *
+ - Maximum number of threads per grid
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ - :math:`2^{32} - 1`
+ *
+ - Maximum x-, y- or z-dimension of a block
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ *
+ - Maximum number of threads per block
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ - :math:`1024`
+ *
+ - Wavefront size
+ - 32 [1]_
+ - 64
+ - 32 [1]_
+ - 64
+ - 32 [1]_
+ - 64
+ *
+ - Maximum number of resident blocks per compute unit
+ - 40 [1]_
+ - 32
+ - 32 [1]_
+ - 32
+ - 32 [1]_
+ - 32
+ *
+ - Maximum number of resident wavefronts per compute unit
+ - 40 [1]_
+ - 32
+ - 32 [1]_
+ - 32
+ - 32 [1]_
+ - 32
+ *
+ - Maximum number of resident threads per compute unit
+ - 1280 [2]_
+ - 2048
+ - 1024 [2]_
+ - 2048
+ - 1024 [2]_
+ - 2048
+ *
+ - Maximum number of 32-bit vector registers per thread
+ - 256
+ - 256 (vector) + 256 (matrix)
+ - 256
+ - 256 (vector) + 256 (matrix)
+ - 256
+ - 256 (vector) + 256 (matrix)
+ *
+ - Maximum number of 32-bit scalar accumulation registers per thread
+ - 106
+ - 104
+ - 106
+ - 104
+ - 106
+ - 104
+
+.. [1] RDNA architectures have a configurable wavefront size. The native
+ wavefront size is 32, but they can run in "CU mode", which has an effective
+ wavefront size of 64. This affects the number of resident wavefronts and
+ blocks per compute unit.
+.. [2] RDNA architectures expand the concept of the traditional compute unit
+ with the so-called work group processor, which effectively includes two
+ compute units, within which all threads can cooperate.
diff --git a/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst b/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst
index 60236e5169..84282427f0 100644
--- a/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst
+++ b/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst
@@ -11,5 +11,5 @@ The structs, define macros, enums and files in the HIP runtime API.
* :ref:`global_enum_defines_reference`
* :ref:`driver_types_reference`
-* :doc:`hip:doxygen/html/annotated`
-* :doc:`hip:doxygen/html/files`
+* :doc:`../../doxygen/html/annotated`
+* :doc:`../../doxygen/html/files`
diff --git a/docs/reference/hip_runtime_api/modules.rst b/docs/reference/hip_runtime_api/modules.rst
index 375d1abcd7..9a976ac9e8 100644
--- a/docs/reference/hip_runtime_api/modules.rst
+++ b/docs/reference/hip_runtime_api/modules.rst
@@ -37,6 +37,5 @@ The API is organized into modules based on functionality.
* :ref:`runtime_compilation_reference`
* :ref:`callback_activity_apis_reference`
* :ref:`graph_management_reference`
-* :ref:`opengl_interoperability_reference`
* :ref:`graphics_interoperability_reference`
* :ref:`cooperative_groups_reference`
diff --git a/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst b/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst
deleted file mode 100644
index d90a790fbf..0000000000
--- a/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-.. meta::
- :description: The OpenGL interoperability reference page.
- :keywords: AMD, ROCm, HIP, CUDA, OpenGL interoperability, OpenGL interop
-
-.. _opengl_interoperability_reference:
-
-*******************************************************************************
-OpenGL interoperability
-*******************************************************************************
-
-.. doxygengroup:: GL
- :content-only:
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 56abaee03f..5d69918b99 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -5,6 +5,9 @@ defaults:
maxdepth: 6
root: index
subtrees:
+- entries:
+ - file: what_is_hip
+ - file: faq
- caption: Install
entries:
@@ -12,33 +15,50 @@ subtrees:
title: Installing HIP
- file: install/build
title: Building HIP from source
+ - url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/system-requirements.html
+ title: Linux supported AMD GPUs
+ - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
+ title: Windows supported AMD GPUs
+ - url: https://developer.nvidia.com/cuda-gpus
+ title: NVIDIA supported GPUs
-- caption: Conceptual
+- caption: Programming guide
entries:
+ - file: programming_guide
+ title: Introduction
- file: understand/programming_model
- file: understand/hardware_implementation
- - file: understand/amd_clr
- - file: understand/texture_fetching
- title: Texture fetching
-
-- caption: How to
- entries:
- - file: how-to/programming_manual
- - file: how-to/hip_porting_guide
- - file: how-to/hip_porting_driver_api
- - file: how-to/hip_rtc
+ - file: understand/compilers
- file: how-to/performance_guidelines
- file: how-to/debugging
- file: how-to/logging
- - file: how-to/cooperative_groups
- - file: how-to/unified_memory
- title: Unified memory
- - file: how-to/virtual_memory
- title: Virtual memory
- - file: how-to/stream_ordered_allocator
- - file: how-to/hipgraph
- title: HIP graphs
- - file: how-to/faq
+ - file: how-to/hip_runtime_api
+ subtrees:
+ - entries:
+ - file: how-to/hip_runtime_api/initialization
+ - file: how-to/hip_runtime_api/memory_management
+ subtrees:
+ - entries:
+ - file: how-to/hip_runtime_api/memory_management/host_memory
+ - file: how-to/hip_runtime_api/memory_management/device_memory
+ subtrees:
+ - entries:
+ - file: how-to/hip_runtime_api/memory_management/device_memory/texture_fetching
+ - file: how-to/hip_runtime_api/memory_management/coherence_control
+ - file: how-to/hip_runtime_api/memory_management/unified_memory
+ - file: how-to/hip_runtime_api/memory_management/virtual_memory
+ - file: how-to/hip_runtime_api/memory_management/stream_ordered_allocator
+ - file: how-to/hip_runtime_api/error_handling
+ - file: how-to/hip_runtime_api/cooperative_groups
+ - file: how-to/hip_runtime_api/hipgraph
+ - file: how-to/hip_runtime_api/call_stack
+ - file: how-to/hip_runtime_api/multi_device
+ - file: how-to/hip_runtime_api/opengl_interop
+ - file: how-to/hip_runtime_api/external_interop
+ - file: how-to/hip_porting_guide
+ - file: how-to/hip_porting_driver_api
+ - file: how-to/hip_rtc
+ - file: understand/amd_clr
- caption: Reference
entries:
@@ -75,7 +95,7 @@ subtrees:
- file: reference/hip_runtime_api/modules/runtime_compilation
- file: reference/hip_runtime_api/modules/callback_activity_apis
- file: reference/hip_runtime_api/modules/graph_management
- - file: reference/hip_runtime_api/modules/opengl_interoperability
+ - file: reference/hip_runtime_api/modules/graphics_interoperability
- file: reference/hip_runtime_api/modules/cooperative_groups_reference
- file: reference/hip_runtime_api/global_defines_enums_structs_files
subtrees:
@@ -90,12 +110,14 @@ subtrees:
- file: reference/cpp_language_support
title: C++ language support
- file: reference/math_api
+ - file: reference/env_variables
- file: reference/terms
title: Comparing syntax for different APIs
- file: reference/deprecated_api_list
title: List of deprecated APIs
- file: reference/fp8_numbers
title: FP8 numbers in HIP
+ - file: reference/hardware_features
- caption: Tutorials
entries:
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 0dd2727603..0a0f69fc45 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core[api_reference]==1.7.2
+rocm-docs-core[api_reference]==1.10.0
sphinxcontrib.doxylink
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index d9978e5045..b23b89a21d 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -116,7 +116,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
-rocm-docs-core[api-reference]==1.7.2
+rocm-docs-core[api-reference]==1.10.0
# via -r requirements.in
six==1.16.0
# via python-dateutil
diff --git a/docs/tools/example_codes/external_interop.hip b/docs/tools/example_codes/external_interop.hip
new file mode 100644
index 0000000000..a037170809
--- /dev/null
+++ b/docs/tools/example_codes/external_interop.hip
@@ -0,0 +1,1376 @@
+// MIT License
+//
+// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "example_utils.hpp"
+#include "vulkan_utils.hpp"
+
+#include "nvidia_hip_fix.hpp"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "sinewave.frag.spv.h"
+#include "sinewave.vert.spv.h"
+
+// HIP external semaphores currently do not work on Linux when compiling for the
+// AMD platform, so they default to off there. Define the macro to override this.
+// TODO: Remove once this is implemented in hipamd.
+// See https://github.com/ROCm-Developer-Tools/hipamd/issues/48.
+#ifndef USE_EXTERNAL_SEMAPHORES
+ #if defined(__HIP_PLATFORM_AMD__) && !defined(_WIN64)
+ #define USE_EXTERNAL_SEMAPHORES 0
+ #else
+ #define USE_EXTERNAL_SEMAPHORES 1
+ #endif
+#endif
+
+// Waiting on an external semaphore that is signaled from HIP currently does not seem
+// to work on Windows, so it defaults to off for AMD on Windows (overridable).
+#ifndef USE_SIGNAL_SEMAPHORE
+ #if defined(__HIP_PLATFORM_AMD__) && defined(_WIN64)
+ #define USE_SIGNAL_SEMAPHORE 0
+
+ #else
+ #define USE_SIGNAL_SEMAPHORE 1
+ #endif
+#endif
+
+/// \brief The maximum number of frames that can be rendered at the same time. By
+/// setting this value to more than one, we can allow the presentation engine to
+/// draw the rendered frame to the monitor while we already render the next frame
+/// in the background.
+constexpr size_t max_frames_in_flight = 2;
+
+/// \brief The maximum time (in nanoseconds) that we are willing to wait on the next
+/// image from the swapchain. The largest representable value is used here.
+constexpr uint64_t frame_timeout = std::numeric_limits<uint64_t>::max();
+
+/// \brief The number of triangles that the example's grid is in width.
+constexpr uint32_t grid_width = 256;
+/// \brief The number of triangles that the example's grid is in height.
+constexpr uint32_t grid_height = 256;
+
+/// \brief The Vulkan instance extensions required for sharing types between HIP and
+/// Vulkan. \p VK_KHR_external_memory_capabilities is required to share buffers, and
+/// \p VK_KHR_external_semaphore_capabilities is required to share semaphores.
+/// \p VK_KHR_get_physical_device_properties2 is required by the other two, as well
+/// as for querying the device's UUID.
+constexpr const char* required_instance_extensions[] = {
+ VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+};
+
+/// \brief Device extensions a physical device must support to run this example:
+/// \p VK_KHR_swapchain to draw to the window, and \p VK_KHR_external_memory and
+/// \p VK_KHR_external_semaphore (plus their Win32/fd variants) to share resources with HIP.
+constexpr const char* required_device_extensions[] = {
+    VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
+#ifdef _WIN64
+    VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#else
+    VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+#endif
+};
+
+/// \brief This structure represents a device UUID, obtained either from Vulkan or
+/// from HIP.
+struct uuid
+{
+    uint8_t bytes[VK_UUID_SIZE];
+
+    /// \brief This function fetches a Vulkan-compatible device UUID from a HIP device.
+    ///
+    /// Ideally \p hipDeviceGetUuid would be used instead. However, on AMD it returns a UUID
+    /// that is not compatible with the one returned by Vulkan, and when compiling for NVIDIA
+    /// it yields a linker error. On AMD we therefore build the UUID from the PCI location, which
+    /// matches both the Mesa (RADV) and AMD (AMDVLK) Vulkan drivers; on NVIDIA we query CUDA.
+    /// \param device - The HIP device to query. \return The device's Vulkan-compatible UUID.
+    static uuid get_hip_device_uuid(hipDevice_t device)
+    {
+#if defined(__HIP_PLATFORM_AMD__)
+        // The value that hipDeviceGetUuid returns does not correspond with those returned
+        // by mesa (see https://gitlab.freedesktop.org/mesa/mesa/-/blob/5cd3e395037250946ba2519600836341df02c8ca/src/amd/common/ac_gpu_info.c#L1366-1382)
+        // and by xgl (see https://github.com/GPUOpen-Drivers/xgl/blob/4118707939c2f4783d28ce2a383184a3794ca477/icd/api/vk_physical_device.cpp#L4363-L4421)
+        // Those drivers _do_ align with each other, so we can create our own UUID here.
+        // \see https://github.com/ROCm-Developer-Tools/hipamd/issues/50.
+        hipDeviceProp_t props;
+        HIP_CHECK(hipGetDeviceProperties(&props, device));
+
+        struct uuid result = {};
+        // Words 0-2: PCI domain, bus, device; word 3 (function) stays 0. memcpy avoids aliasing UB.
+        const uint32_t pci_ids[3] = {static_cast<uint32_t>(props.pciDomainID),
+                                     static_cast<uint32_t>(props.pciBusID),
+                                     static_cast<uint32_t>(props.pciDeviceID)};
+        std::memcpy(result.bytes, pci_ids, sizeof(pci_ids));
+
+        return result;
+#elif defined(__HIP_PLATFORM_NVIDIA__)
+        // Work around a compile error related to hipDeviceGetUuid when compiling for NVIDIA:
+        // "undefined reference to `cuDeviceGetUuid'"
+        // \see https://github.com/ROCm-Developer-Tools/hipamd/issues/51.
+        cudaDeviceProp props;
+        HIP_CHECK(hipCUDAErrorTohipError(cudaGetDeviceProperties(&props, device)));
+
+        struct uuid result = {};
+        std::memcpy(result.bytes, props.uuid.bytes, VK_UUID_SIZE);
+
+        return result;
+#else
+    #error unsupported platform
+#endif
+    }
+};
+
+/// \brief \p std::ostream print operator overload for \p uuid. Prints lowercase hex digits
+/// in the canonical 8-4-4-4-12 grouping: a dash precedes nibbles 8, 12, 16 and 20. \see uuid.
+std::ostream& operator<<(std::ostream& os, const uuid uuid)
+{
+    for(size_t i = 0; i < VK_UUID_SIZE * 2; ++i)
+    {
+        if(i == 8 || i == 12 || i == 16 || i == 20)
+        {
+            os << '-';
+        }
+        // Extract the current nibble, taking the high nibble of each byte first.
+        const uint8_t c = (uuid.bytes[i / 2] >> (4 - (i % 2) * 4)) & 0xF;
+        os << static_cast<char>(c < 10 ? c + '0' : c + 'a' - 10);
+    }
+    return os;
+}
+
+/// \brief This structure represents a candidate HIP device that we can use
+/// for this example, paired with the UUID used to match it to a Vulkan device.
+struct hip_device_candidate
+{
+ /// The HIP device index representing this device.
+ hipDevice_t device;
+ /// The Vulkan-compatible device UUID (see \p uuid::get_hip_device_uuid).
+ uuid device_uuid;
+};
+
+/// \brief This structure represents a candidate device that we can use for this
+/// example. It is filled in by \p is_physical_device_suitable.
+struct physical_device_candidate
+{
+ /// The Vulkan physical device handle of the device to be used.
+ VkPhysicalDevice pdev;
+
+ /// The candidate device's Vulkan device properties.
+ VkPhysicalDeviceProperties props;
+
+ /// The HIP device candidate whose UUID matches this Vulkan device's UUID.
+ hip_device_candidate hip_candidate;
+
+ /// The queue allocation that contains details about which queues will be
+ /// used throughout this example.
+ queue_allocation queues;
+};
+
+/// \brief Checks if a particular Vulkan physical device is qualified to run this example:
+/// - It needs to be a HIP-supported device: its Vulkan device UUID must match the UUID of
+///   one of the HIP devices passed through \p hip_devices.
+/// - It needs to support the Vulkan surface which we want to render to.
+/// - It needs to support the required generic and platform-specific Vulkan device extensions.
+/// - It needs to support graphics- and present queues that can render to the surface.
+/// Returns \p true if all checks pass; otherwise \p false, in which case \p candidate may
+/// have been partially overwritten and must not be used.
+///
+/// \param dispatch - Dispatch object providing the instance-level Vulkan functions used here.
+/// \param hip_devices - The usable HIP devices and their Vulkan-compatible device UUIDs.
+/// \param pdev - The Vulkan physical device to check suitability of.
+/// \param surface - The Vulkan surface that the physical device needs to support.
+/// \param candidate - Receives the device's properties, matching HIP device, queue
+///   allocation and physical device handle when the device is suitable.
+bool is_physical_device_suitable(const instance_dispatch&                 dispatch,
+                                 const std::vector<hip_device_candidate>& hip_devices,
+                                 VkPhysicalDevice                         pdev,
+                                 VkSurfaceKHR                             surface,
+                                 physical_device_candidate&               candidate)
+{
+    // Check if HIP supports this device by checking if there is any device with the same UUID.
+    {
+        // Query the Vulkan device UUID using vkGetPhysicalDeviceProperties2.
+        VkPhysicalDeviceIDPropertiesKHR id_props = {};
+        id_props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR;
+
+        VkPhysicalDeviceProperties2KHR props2 = {};
+        props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
+        props2.pNext = &id_props;
+
+        dispatch.get_physical_device_properties2(pdev, &props2);
+
+        const auto cmp_device_uuid = [&](const hip_device_candidate& hip_candidate)
+        {
+            return std::equal(std::begin(hip_candidate.device_uuid.bytes),
+                              std::end(hip_candidate.device_uuid.bytes),
+                              std::begin(id_props.deviceUUID),
+                              std::end(id_props.deviceUUID));
+        };
+
+        // Try to find a HIP device UUID that matches the UUID reported by Vulkan - if any such exists,
+        // we know that the device supports both Vulkan and HIP, and we can use it to run this example.
+        const auto it = std::find_if(hip_devices.begin(), hip_devices.end(), cmp_device_uuid);
+        if(it == hip_devices.end())
+        {
+            // This device does not support HIP.
+            return false;
+        }
+
+        candidate.props         = props2.properties;
+        candidate.hip_candidate = *it;
+    }
+
+    // Check if the device supports our surface at all.
+    if(!check_surface_support(dispatch, pdev, surface))
+    {
+        return false;
+    }
+
+    // Check if the device supports the required extensions.
+    if(!check_device_extensions(dispatch,
+                                pdev,
+                                required_device_extensions,
+                                std::size(required_device_extensions)))
+    {
+        return false;
+    }
+
+    // Try to allocate device queues for the candidate device.
+    if(!allocate_device_queues(dispatch, pdev, surface, candidate.queues))
+    {
+        return false;
+    }
+
+    candidate.pdev = pdev;
+    return true;
+}
+
+/// \brief Try to find a physical device that can run this example. This is done by fetching
+/// all supported devices from HIP and from Vulkan, and checking each Vulkan device to see if
+/// the required features are supported. The first suitable device is written to
+/// \p candidate; if there is none, an error is printed and the process exits.
+///
+/// To check whether a Vulkan and HIP device are the same, their UUIDs are compared.
+/// \see \p uuid::get_hip_device_uuid and \p is_physical_device_suitable.
+void find_physical_device(const instance_dispatch&   dispatch,
+                          VkInstance                 instance,
+                          VkSurfaceKHR               surface,
+                          physical_device_candidate& candidate)
+{
+    uint32_t physical_device_count = 0;
+    VK_CHECK(dispatch.enumerate_physical_devices(instance, &physical_device_count, nullptr));
+    std::vector<VkPhysicalDevice> physical_devices(physical_device_count);
+    VK_CHECK(dispatch.enumerate_physical_devices(instance,
+                                                 &physical_device_count,
+                                                 physical_devices.data()));
+
+    if(physical_device_count == 0)
+    {
+        std::cerr << "System has no physical devices\n";
+        std::exit(error_exit_code);
+    }
+
+    // Fetch the number of HIP devices that are currently present on the system.
+    // Note: This depends on the current HIP platform, and may report different
+    // devices depending on that.
+    int hip_device_count = 0;
+    HIP_CHECK(hipGetDeviceCount(&hip_device_count));
+
+    // For each HIP device, check to see if we can use it at all, and then query
+    // its Vulkan-compatible device UUID.
+    std::vector<hip_device_candidate> hip_devices;
+    for(hipDevice_t hip_device = 0; hip_device < hip_device_count; ++hip_device)
+    {
+        hipDeviceProp_t hip_properties;
+        HIP_CHECK(hipGetDeviceProperties(&hip_properties, hip_device));
+        if(hip_properties.computeMode == hipComputeModeProhibited)
+            continue;
+
+        const uuid device_uuid = uuid::get_hip_device_uuid(hip_device);
+        hip_devices.push_back({hip_device, device_uuid});
+    }
+
+    for(VkPhysicalDevice pdev : physical_devices)
+    {
+        if(is_physical_device_suitable(dispatch, hip_devices, pdev, surface, candidate))
+        {
+            return;
+        }
+    }
+
+    std::cerr << "No suitable device\n";
+    std::exit(error_exit_code);
+}
+
+/// \brief Allocates memory for a Vulkan buffer, binds it to the buffer, and returns it.
+/// \param ctx - The graphics context providing the device (\p dev) and its functions (\p vkd).
+/// \param buffer - The buffer to allocate and bind memory for.
+/// \param properties - The memory properties for the allocated memory.
+/// \param external - Whether to allocate this memory such that it can be exported.
+VkDeviceMemory allocate_buffer_memory(const graphics_context&     ctx,
+                                      const VkBuffer              buffer,
+                                      const VkMemoryPropertyFlags properties,
+                                      const bool                  external = false)
+{
+    // The handle type the memory may later be exported as depends on the target platform.
+#ifdef _WIN64
+    constexpr auto export_handle_type = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+#else
+    constexpr auto export_handle_type = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+#endif
+
+    VkMemoryRequirements requirements;
+    ctx.vkd->get_buffer_memory_requirements(ctx.dev, buffer, &requirements);
+
+    // Only chained into the allocation below when exportable memory was requested.
+    VkExportMemoryAllocateInfoKHR export_request = {};
+    export_request.sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR;
+    export_request.handleTypes = export_handle_type;
+
+    VkMemoryAllocateInfo alloc_request = {};
+    alloc_request.sType          = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    alloc_request.pNext          = external ? &export_request : nullptr;
+    alloc_request.allocationSize = requirements.size;
+    alloc_request.memoryTypeIndex
+        = ctx.find_memory_type_index(requirements.memoryTypeBits, properties);
+
+    VkDeviceMemory device_memory;
+    VK_CHECK(ctx.vkd->allocate_memory(ctx.dev, &alloc_request, nullptr, &device_memory));
+    VK_CHECK(ctx.vkd->bind_buffer_memory(ctx.dev, buffer, device_memory, 0));
+    return device_memory;
+}
+
+/// \brief Creates a Vulkan buffer object. No memory is allocated or bound here,
+/// \see allocate_buffer_memory for that.
+/// \param ctx - The graphics context providing the device (\p dev) and its functions (\p vkd).
+/// \param size - The size (in bytes) that this buffer should be created with.
+/// \param usage - The Vulkan usage that this buffer will be used for.
+/// \param external - If true, this buffer will be created so that it can later be exported to a
+/// platform-native handle, that may be imported to HIP.
+VkBuffer create_buffer(const graphics_context&  ctx,
+                       const VkDeviceSize       size,
+                       const VkBufferUsageFlags usage,
+                       const bool               external = false)
+{
+    // An exportable buffer must announce the native handle type it will be exported as;
+    // which type that is depends on the platform we are currently compiling for.
+    VkExternalMemoryBufferCreateInfoKHR export_request = {};
+    export_request.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR;
+#ifdef _WIN64
+    export_request.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+#else
+    export_request.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+#endif
+
+    VkBufferCreateInfo buffer_request = {};
+    buffer_request.sType       = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+    buffer_request.size        = size;
+    buffer_request.usage       = usage;
+    buffer_request.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+
+    // The export information only reaches Vulkan when it is chained into the create info.
+    if(external)
+    {
+        buffer_request.pNext = &export_request;
+    }
+
+    VkBuffer new_buffer;
+    VK_CHECK(ctx.vkd->create_buffer(ctx.dev, &buffer_request, nullptr, &new_buffer));
+    return new_buffer;
+}
+
+/// \brief This function converts a Vulkan memory handle to its equivalent HIP handle. The
+/// VkDeviceMemory passed to this function and the returned HIP memory represent the same
+/// physical area of GPU memory, through the handles of each respective API. Writing to the
+/// buffer in one API will allow us to read the results through the other. Note that access
+/// to the buffer should be synchronized between the APIs, for example using queue syncs or
+/// semaphores.
+/// \param memory - The Vulkan memory handle to convert. This memory needs to be created with
+/// the appropriate fields set in VkExportMemoryAllocateInfoKHR.
+/// \param size - The size (in bytes) of the memory, stored in the HIP import descriptor.
+/// \see allocate_buffer_memory and \see create_buffer for creating compatible memory and buffers.
+hipExternalMemory_t
+ memory_to_hip(const graphics_context& ctx, const VkDeviceMemory memory, const VkDeviceSize size)
+{
+ // [Sphinx vulkan memory to hip start]
+ // Prepare the HIP external memory descriptor with the platform-specific
+ // handle type that we wish to import. This value should correspond to the
+ // handleTypes field set in VkExportMemoryAllocateInfoKHR while allocating the
+ // Vulkan memory.
+ hipExternalMemoryHandleDesc desc = {};
+ desc.size = size;
+
+ // Export the Vulkan buffer handle to a platform-specific native handle, depending
+ // on the current platform: On Windows the buffer is converted to a HANDLE, and on Linux
+ // to a file descriptor representing the driver's GPU handle to the memory.
+ // This native handle is then passed to the HIP external memory descriptor so that it
+ // may be imported.
+#ifdef _WIN64
+ desc.type = hipExternalMemoryHandleTypeOpaqueWin32Kmt;
+
+ VkMemoryGetWin32HandleInfoKHR get_handle_info = {};
+ get_handle_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
+ get_handle_info.memory = memory;
+ get_handle_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+
+ VK_CHECK(
+ ctx.vkd->get_memory_win32_handle(ctx.dev, &get_handle_info, &desc.handle.win32.handle));
+#else
+ desc.type = hipExternalMemoryHandleTypeOpaqueFd;
+
+ VkMemoryGetFdInfoKHR get_fd_info = {};
+ get_fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
+ get_fd_info.memory = memory;
+ get_fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+
+ VK_CHECK(ctx.vkd->get_memory_fd(ctx.dev, &get_fd_info, &desc.handle.fd));
+#endif
+
+ // Import the native memory handle to HIP to create an external memory.
+ hipExternalMemory_t hip_memory;
+ HIP_CHECK(hipImportExternalMemory(&hip_memory, &desc));
+ return hip_memory;
+ // [Sphinx vulkan memory to hip end]
+}
+
+/// \brief Utility function that creates a Vulkan semaphore, optionally in a form that
+/// can be exported to a platform-native handle.
+/// \param ctx - The graphics context providing the device (\p dev) and its functions (\p vkd).
+/// \param external - If true, this semaphore is created so that it can later be exported
+/// to a platform-native handle, which may be imported to HIP later.
+/// \return The newly created semaphore.
+VkSemaphore create_semaphore(const graphics_context& ctx, const bool external = false)
+{
+    // As with buffers, an exportable semaphore has to be created with the native handle
+    // type it will be exported as, which depends on the platform we are compiling for.
+#ifdef _WIN64
+    constexpr auto native_handle_type = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+#else
+    constexpr auto native_handle_type = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+#endif
+
+    VkExportSemaphoreCreateInfoKHR export_request = {};
+    export_request.sType       = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
+    export_request.handleTypes = native_handle_type;
+
+    // The export structure is chained in only when an exportable semaphore is requested.
+    VkSemaphoreCreateInfo semaphore_request = {};
+    semaphore_request.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    semaphore_request.pNext = external ? &export_request : nullptr;
+
+    VkSemaphore semaphore;
+    VK_CHECK(ctx.vkd->create_semaphore(ctx.dev, &semaphore_request, nullptr, &semaphore));
+    return semaphore;
+}
+
+/// \brief This function converts a Vulkan semaphore to its equivalent HIP handle. The passed
+/// semaphore and the returned HIP semaphore represent the same backing semaphore, through the
+/// handles of the respective API. Signaling on the semaphore in one API will allow the other
+/// API to wait on it, which is how we can guarantee synchronized access to resources in a
+/// cross-API manner.
+/// \param sema - The Vulkan semaphore to convert. This semaphore needs to be created with
+/// the appropriate fields set in VkExportSemaphoreCreateInfoKHR.
+/// \see create_semaphore for creating such a semaphore.
+hipExternalSemaphore_t semaphore_to_hip(const graphics_context& ctx, const VkSemaphore sema)
+{
+ // [Sphinx semaphore import start]
+ // Prepare the HIP external semaphore descriptor with the platform-specific handle type
+ // that we wish to import. This value should correspond to the handleTypes field set in
+ // the VkExportSemaphoreCreateInfoKHR structure that was passed to Vulkan when creating
+ // the semaphore.
+ hipExternalSemaphoreHandleDesc desc = {};
+
+ // Export the Vulkan semaphore to a platform-specific handle depending on the current
+ // platform: On Windows, we convert the semaphore into a HANDLE, and on Linux it is
+ // converted to a file descriptor.
+ // This native handle is then passed to the HIP external semaphore descriptor.
+#ifdef _WIN64
+ desc.type = hipExternalSemaphoreHandleTypeOpaqueWin32;
+
+ VkSemaphoreGetWin32HandleInfoKHR get_handle_info = {};
+ get_handle_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR;
+ get_handle_info.semaphore = sema;
+ get_handle_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+
+ VK_CHECK(
+ ctx.vkd->get_semaphore_win32_handle(ctx.dev, &get_handle_info, &desc.handle.win32.handle));
+
+#else
+ desc.type = hipExternalSemaphoreHandleTypeOpaqueFd;
+
+ VkSemaphoreGetFdInfoKHR get_fd_info = {};
+ get_fd_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+ get_fd_info.semaphore = sema;
+ get_fd_info.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+
+ VK_CHECK(ctx.vkd->get_semaphore_fd(ctx.dev, &get_fd_info, &desc.handle.fd));
+#endif
+
+ // Import the native semaphore to HIP to create a HIP external semaphore.
+ hipExternalSemaphore_t hip_sema;
+ HIP_CHECK(hipImportExternalSemaphore(&hip_sema, &desc));
+ // [Sphinx semaphore import end]
+ return hip_sema;
+}
+
+/// \brief Maps HIP external memory to a pointer. Memory that was exported from Vulkan and
+/// imported into HIP cannot be used directly; it first has to be mapped to a pointer that
+/// can be passed to a kernel for reading and writing.
+/// \param mem - The HIP external memory handle to map.
+/// \param size - The number of bytes to map, starting at offset 0 of \p mem.
+/// \returns The pointer produced by hipExternalMemoryGetMappedBuffer.
+void* map_hip_external_memory(const hipExternalMemory_t mem, const VkDeviceSize size)
+{
+    // [Sphinx map external memory start]
+    // Map the whole range [0, size) without any special flags.
+    hipExternalMemoryBufferDesc buffer_desc = {};
+    buffer_desc.flags  = 0;
+    buffer_desc.offset = 0;
+    buffer_desc.size   = size;
+
+    void* mapped;
+    HIP_CHECK(hipExternalMemoryGetMappedBuffer(&mapped, mem, &buffer_desc));
+    // [Sphinx map external memory end]
+    return mapped;
+}
+
+// [Sphinx sinewave kernel start]
+/// \brief The main HIP kernel for this example - computes a simple sine wave over a
+/// 2-dimensional grid of points. Each thread computes the value of a single point.
+/// \param height_map - the grid of points to compute a sine wave for. It is expected to be
+/// a \p grid_width by \p grid_height array packed into memory with y on the inner axis,
+/// i.e. the point (x, y) is stored at index <tt>x * grid_height + y</tt>.
+/// \param time - The current time relative to the start of the program.
+__global__ void sinewave_kernel(float* height_map, const float time)
+{
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // The launch grid may overshoot the point grid; surplus threads have nothing to write.
+    if(x >= grid_width || y >= grid_height)
+    {
+        return;
+    }
+
+    const float freq = 10.f;
+
+    // Map the integer grid position onto [-1, 1) along both axes.
+    const float u = (2.f * x) / grid_width - 1.f;
+    const float v = (2.f * y) / grid_height - 1.f;
+
+    // y is the inner axis, so consecutive x values are grid_height elements apart. Using
+    // grid_width as the stride is only correct when the grid happens to be square.
+    height_map[x * grid_height + y] = sinf(u * freq + time) * cosf(v * freq + time);
+}
+// [Sphinx sinewave kernel end]
+
+/// \brief In order to increase efficiency, the rendering process is pipelined. This allows the
+/// next frame to be rendered while another frame is being presented by Vulkan. The \p frame
+/// structure contains the Vulkan handles that are duplicated for each phase of the pipeline.
+///
+/// A \p frame owns its handles and destroys them in its destructor. It is therefore move-only:
+/// copying would leave two objects destroying the same handles.
+struct frame
+{
+    const graphics_context& ctx;
+
+    /// The semaphore that guards the use of the swapchain image before it is ready.
+    VkSemaphore image_acquired;
+    /// The semaphore that guards the present before the image is rendered.
+    VkSemaphore render_finished;
+    /// A fence that allows us to synchronize on CPU until this frame is ready
+    /// to be re-rendered again after it has been submitted to the GPU.
+    VkFence frame_fence;
+    /// The command pool that the command buffer for this frame is allocated from.
+    /// By having a separate pool for each frame we can reset the commands for the frame simply
+    /// by resetting the pool.
+    VkCommandPool cmd_pool;
+    /// The main command buffer for this frame.
+    VkCommandBuffer cmd_buf;
+
+    /// \brief Create a new frame: two semaphores, a fence, a command pool and one primary
+    /// command buffer allocated from that pool.
+    /// \param ctx - The graphics context to create the handles with. The frame keeps a
+    /// reference to it, so it must outlive the frame.
+    explicit frame(const graphics_context& ctx) : ctx(ctx)
+    {
+        this->image_acquired  = create_semaphore(ctx);
+        this->render_finished = create_semaphore(ctx);
+
+        // The fence starts out signaled so that the first wait() on a frame that has never
+        // been submitted returns immediately.
+        VkFenceCreateInfo fence_create_info = {};
+        fence_create_info.sType             = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+        fence_create_info.flags             = VK_FENCE_CREATE_SIGNALED_BIT;
+        VK_CHECK(ctx.vkd->create_fence(ctx.dev, &fence_create_info, nullptr, &this->frame_fence));
+
+        VkCommandPoolCreateInfo cmd_pool_create_info = {};
+        cmd_pool_create_info.sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+        cmd_pool_create_info.queueFamilyIndex = ctx.graphics_queue.family;
+        VK_CHECK(
+            ctx.vkd->create_command_pool(ctx.dev, &cmd_pool_create_info, nullptr, &this->cmd_pool));
+
+        VkCommandBufferAllocateInfo cmd_buf_allocate_info = {};
+        cmd_buf_allocate_info.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+        cmd_buf_allocate_info.commandPool        = this->cmd_pool;
+        cmd_buf_allocate_info.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+        cmd_buf_allocate_info.commandBufferCount = 1;
+        VK_CHECK(
+            ctx.vkd->allocate_command_buffers(ctx.dev, &cmd_buf_allocate_info, &this->cmd_buf));
+    }
+
+    /// \brief Move-construct a frame, taking over the handles of \p other. The handles of
+    /// \p other are reset to VK_NULL_HANDLE, which the Vulkan destroy functions ignore, so
+    /// destroying the moved-from frame does not release anything.
+    frame(frame&& other) noexcept
+        : ctx(other.ctx)
+        , image_acquired(other.image_acquired)
+        , render_finished(other.render_finished)
+        , frame_fence(other.frame_fence)
+        , cmd_pool(other.cmd_pool)
+        , cmd_buf(other.cmd_buf)
+    {
+        other.image_acquired  = VK_NULL_HANDLE;
+        other.render_finished = VK_NULL_HANDLE;
+        other.frame_fence     = VK_NULL_HANDLE;
+        other.cmd_pool        = VK_NULL_HANDLE;
+        other.cmd_buf         = VK_NULL_HANDLE;
+    }
+
+    // Copying would duplicate ownership of the handles and destroy them twice.
+    frame(const frame&)            = delete;
+    frame& operator=(const frame&) = delete;
+    // The reference member cannot be re-seated, so assignment is not supported either.
+    frame& operator=(frame&&) = delete;
+
+    /// \brief Destroy the handles owned by this frame. The command buffer is not freed
+    /// separately; only its pool is destroyed.
+    ~frame()
+    {
+        this->ctx.vkd->destroy_command_pool(this->ctx.dev, this->cmd_pool, nullptr);
+        this->ctx.vkd->destroy_fence(this->ctx.dev, this->frame_fence, nullptr);
+        this->ctx.vkd->destroy_semaphore(this->ctx.dev, this->image_acquired, nullptr);
+        this->ctx.vkd->destroy_semaphore(this->ctx.dev, this->render_finished, nullptr);
+    }
+
+    /// \brief Wait until the GPU-work for this frame has been completed, so that we
+    /// can render to it again. Waits on \p frame_fence for at most \p frame_timeout.
+    void wait() const
+    {
+        VK_CHECK(this->ctx.vkd->wait_for_fences(this->ctx.dev,
+                                                1,
+                                                &this->frame_fence,
+                                                VK_TRUE,
+                                                frame_timeout));
+    }
+
+    /// \brief Reset the fence that backs this frame.
+    void reset() const
+    {
+        VK_CHECK(this->ctx.vkd->reset_fences(this->ctx.dev, 1, &this->frame_fence));
+    }
+};
+
+/// \brief This structure contains all the rendering related information for this example.
+/// It differs from the \p graphics_context in that a typical Vulkan program usually has only
+/// one graphics_context-like structure, but may have multiple renderer-like structures.
+/// In this example though, there is only one.
+///
+/// This renderer renders a grid of triangles to the window, the color of which is determined by
+/// a HIP computation. Rendering is done using 3 buffers:
+/// - The height buffer contains one height per vertex (rendered as color).
+/// - The grid buffer holds the x- and y-coordinates of each vertex. These coordinates are
+///   unique: the triangles that are made up from these points are defined by the index buffer.
+/// - The index buffer holds indices into the former two buffers to make up a list of triangles.
+struct renderer
+{
+ /// The total number of vertices for the triangles.
+ constexpr static size_t num_verts = grid_width * grid_height;
+ /// The number of bytes in the x- and y-coordinates buffer. Each x/y coordinate is encoded as
+ /// a pair of floats, which are stored in a packed array-of-structures format: | x | y | x | y | ... |.
+ constexpr static size_t grid_buffer_size = num_verts * sizeof(float) * 2;
+ /// The number of bytes in the height buffer. Each height is encoded as a floating point value.
+ /// This buffer will be shared with HIP, which is why the heights are stored in a buffer
+ /// separate from the coordinates.
+ constexpr static size_t height_buffer_size = num_verts * sizeof(float);
+
+ /// The number of indices in the index buffer. Each triangle has 3 points, each square in the grid
+ /// is made up of 2 triangles. There are (width - 1) by (height - 1) squares in the grid.
+ constexpr static size_t num_indices = (grid_width - 1) * (grid_height - 1) * 3 * 2;
+ /// The number of bytes in the index buffer. Each index is encoded as a 32-bit int.
+ constexpr static size_t index_buffer_size = num_indices * sizeof(uint32_t);
+
+ const graphics_context& ctx;
+ swapchain& sc;
+
+ /// The HIP device and the stream on that device that compute work is submitted to.
+ hipDevice_t hip_device;
+ hipStream_t hip_stream;
+
+ VkRenderPass render_pass;
+
+ /// The frames in the rendering pipeline.
+ std::vector<frame> frames;
+ /// The index of the frame we are currently rendering to.
+ uint32_t frame_index = 0;
+
+ /// The Vulkan frame buffers to render to - each corresponds to a swapchain
+ /// image with the same index in sc.
+ std::vector<VkFramebuffer> framebuffers;
+
+ /// The pipeline layout and pipeline of the rendering pipeline for the Vulkan part
+ /// of this example.
+ VkPipelineLayout pipeline_layout;
+ VkPipeline pipeline;
+
+ /// Whether the swapchain is out-of-date and needs to be recreated.
+ bool swapchain_out_of_date = false;
+
+ /// The buffer and memory holding the grid coordinates.
+ VkBuffer grid_buffer;
+ VkDeviceMemory grid_memory;
+ /// The buffer and memory holding the grid heights.
+ /// This buffer will be exported to HIP.
+ /// \see hip_height_memory.
+ /// \see hip_height_buffer.
+ VkBuffer height_buffer;
+ VkDeviceMemory height_memory;
+ /// The buffer and memory holding the indices for the triangles to render.
+ VkBuffer index_buffer;
+ VkDeviceMemory index_memory;
+
+ /// The HIP-imported version of \p height_memory.
+ hipExternalMemory_t hip_height_memory;
+ /// The HIP-imported version of \p height_buffer mapped into the program's memory.
+ float* hip_height_buffer;
+
+ /// The semaphore that guards between when the buffer has been rendered from the
+ /// Vulkan side and when we can simulate it again from the HIP side, and
+ /// its HIP-imported version.
+ VkSemaphore buffer_ready;
+ hipExternalSemaphore_t hip_buffer_ready;
+
+ /// The semaphore that guards between when the simulation has finished from the HIP
+ /// side and when we can render it to the swapchain in the Vulkan side, and its HIP-
+ /// imported version.
+ VkSemaphore simulation_finished;
+ hipExternalSemaphore_t hip_simulation_finished;
+
+ /// The time at which this example started.
+ std::chrono::high_resolution_clock::time_point start_time;
+
+ /// Counters used to keep track of the current performance: the frame index and the time
+ /// at which the current FPS measurement interval started.
+ uint32_t fps_start_frame = 0;
+ std::chrono::high_resolution_clock::time_point fps_start_time;
+
+ /// \brief Initialize a new renderer: creates the HIP stream, the Vulkan render pass,
+ /// pipeline, frames, framebuffers and buffers, uploads the initial buffer contents, and
+ /// imports the height buffer (and, if enabled, the synchronization semaphores) into HIP.
+ /// \param ctx - The graphics context to create Vulkan resources with. A reference is kept.
+ /// \param sc - The swapchain to render to. A reference is kept.
+ /// \param hip_device - The HIP device that the simulation runs on.
+ renderer(const graphics_context& ctx, swapchain& sc, const hipDevice_t hip_device)
+     : ctx(ctx), sc(sc), hip_device(hip_device)
+ {
+     // Create a HIP stream for the (hip) device that was selected, which compute commands
+     // will be scheduled to later.
+     HIP_CHECK(hipSetDevice(this->hip_device));
+     HIP_CHECK(hipStreamCreate(&this->hip_stream));
+
+     // Initialize the Vulkan resources related to this renderer.
+     this->render_pass     = sc.create_render_pass();
+     this->pipeline_layout = this->ctx.create_pipeline_layout();
+     this->create_pipeline();
+
+     // Reserve up front so that emplacing the frames never reallocates the vector.
+     this->frames.reserve(max_frames_in_flight);
+     for(size_t i = 0; i < max_frames_in_flight; ++i)
+     {
+         this->frames.emplace_back(ctx);
+     }
+
+     this->sc.recreate_framebuffers(this->render_pass, this->framebuffers);
+
+     // Create each of the buffers, and allocate memory for them.
+
+     this->grid_buffer
+         = create_buffer(ctx,
+                         grid_buffer_size,
+                         VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
+
+     // This buffer is going to be exported to HIP, so we should create it as
+     // an external buffer.
+     this->height_buffer
+         = create_buffer(ctx,
+                         height_buffer_size,
+                         VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+                         true);
+
+     this->index_buffer
+         = create_buffer(ctx,
+                         index_buffer_size,
+                         VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
+
+     // Allocate the memory for each buffer.
+
+     this->grid_memory
+         = allocate_buffer_memory(ctx, this->grid_buffer, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+     // Allocate this memory in a way that supports exporting.
+     this->height_memory = allocate_buffer_memory(ctx,
+                                                  this->height_buffer,
+                                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                                                  true);
+     this->index_memory
+         = allocate_buffer_memory(ctx, this->index_buffer, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+     // Upload the initial data to the buffers.
+     this->initialize_buffer_data();
+
+     // Export the height buffer and import it in HIP.
+     this->hip_height_memory = memory_to_hip(this->ctx, this->height_memory, height_buffer_size);
+     // Map it into memory. The mapping is returned as void*; the buffer holds floats.
+     this->hip_height_buffer = reinterpret_cast<float*>(
+         map_hip_external_memory(this->hip_height_memory, height_buffer_size));
+
+     // Create the Vulkan-HIP synchronization resources from Vulkan and import them in HIP.
+     // Without external semaphore support these members are left unset and unused.
+#if USE_EXTERNAL_SEMAPHORES == 1
+     this->buffer_ready     = create_semaphore(this->ctx, true);
+     this->hip_buffer_ready = semaphore_to_hip(this->ctx, this->buffer_ready);
+
+     this->simulation_finished     = create_semaphore(this->ctx, true);
+     this->hip_simulation_finished = semaphore_to_hip(this->ctx, this->simulation_finished);
+#endif
+
+     // Initialize performance counters.
+     this->start_time     = std::chrono::high_resolution_clock::now();
+     this->fps_start_time = this->start_time;
+ }
+
+ /// \brief Release every HIP and Vulkan resource that the constructor created, after all
+ /// outstanding rendering and compute work has finished.
+ ~renderer()
+ {
+     // No frame may still be in flight when its resources go away.
+     this->wait_all_frames();
+
+     // Drain the HIP stream before destroying it.
+     HIP_CHECK(hipStreamSynchronize(this->hip_stream));
+     HIP_CHECK(hipStreamDestroy(this->hip_stream));
+
+     // Shorthands for the dispatch table and the device used by every call below.
+     auto&          vkd = *this->ctx.vkd;
+     const VkDevice dev = this->ctx.dev;
+
+#if USE_EXTERNAL_SEMAPHORES == 1
+     // The Vulkan-HIP synchronization semaphores: first the HIP-imported side, then the
+     // Vulkan side.
+     HIP_CHECK(hipDestroyExternalSemaphore(this->hip_buffer_ready));
+     HIP_CHECK(hipDestroyExternalSemaphore(this->hip_simulation_finished));
+
+     vkd.destroy_semaphore(dev, this->buffer_ready, nullptr);
+     vkd.destroy_semaphore(dev, this->simulation_finished, nullptr);
+#endif
+
+     // The HIP external memory handle. The mapped pointer is not unmapped separately.
+     HIP_CHECK(hipDestroyExternalMemory(this->hip_height_memory));
+
+     // Vulkan device memory, followed by the buffers that were bound to it.
+     vkd.free_memory(dev, this->index_memory, nullptr);
+     vkd.free_memory(dev, this->height_memory, nullptr);
+     vkd.free_memory(dev, this->grid_memory, nullptr);
+     vkd.destroy_buffer(dev, this->index_buffer, nullptr);
+     vkd.destroy_buffer(dev, this->height_buffer, nullptr);
+     vkd.destroy_buffer(dev, this->grid_buffer, nullptr);
+
+     // The graphics pipeline and its layout.
+     vkd.destroy_pipeline_layout(dev, this->pipeline_layout, nullptr);
+     vkd.destroy_pipeline(dev, this->pipeline, nullptr);
+
+     // One framebuffer per swapchain image.
+     for(size_t i = 0; i < this->framebuffers.size(); ++i)
+     {
+         vkd.destroy_framebuffer(dev, this->framebuffers[i], nullptr);
+     }
+
+     vkd.destroy_render_pass(dev, this->render_pass, nullptr);
+ }
+
+ // A renderer owns its resources exclusively: it can be neither copied nor moved.
+ renderer(const renderer&)            = delete;
+ renderer& operator=(const renderer&) = delete;
+
+ renderer(renderer&&)            = delete;
+ renderer& operator=(renderer&&) = delete;
+
+ /// \brief Block until every frame in the pipeline has finished rendering, by waiting on
+ /// each frame in turn.
+ void wait_all_frames()
+ {
+     for(size_t i = 0; i < this->frames.size(); ++i)
+     {
+         this->frames[i].wait();
+     }
+ }
+
+ /// \brief Upload the initial values for each buffer to Vulkan: zeros for the height buffer,
+ /// the vertex positions for the grid buffer and the triangle list for the index buffer.
+ /// All uploads go through a single temporary host-visible staging buffer.
+ void initialize_buffer_data()
+ {
+     // Create a "staging" buffer that is accessible from the CPU, that we will be using to
+     // upload data to. We can re-use the same staging buffer for all three buffers, so create it
+     // so that it is able to hold the largest of the three.
+     constexpr size_t staging_buffer_size
+         = std::max(std::max(grid_buffer_size, height_buffer_size), index_buffer_size);
+     VkBuffer staging_buffer
+         = create_buffer(ctx, staging_buffer_size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
+     VkDeviceMemory staging_memory = allocate_buffer_memory(
+         ctx,
+         staging_buffer,
+         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+
+     // Map the staging buffer into host memory.
+     void* staging;
+     VK_CHECK(
+         this->ctx.vkd
+             ->map_memory(this->ctx.dev, staging_memory, 0, staging_buffer_size, 0, &staging));
+
+     // NOTE(review): the staging memory is overwritten right after each copy_buffer call, so
+     // copy_buffer presumably completes the copy before returning - confirm.
+
+     // Initialize the height buffer: all heights start at zero.
+     {
+         std::memset(staging, 0, height_buffer_size);
+         this->ctx.copy_buffer(this->height_buffer, staging_buffer, height_buffer_size);
+     }
+
+     // Initialize the grid buffer: one (x, y) pair per vertex, row by row, with both
+     // coordinates spread evenly over [-1, 1].
+     {
+         float* grid = reinterpret_cast<float*>(staging);
+         for(uint32_t y = 0; y < grid_height; ++y)
+         {
+             for(uint32_t x = 0; x < grid_width; ++x)
+             {
+                 *grid++ = (2.0f * x) / (grid_width - 1) - 1;
+                 *grid++ = (2.0f * y) / (grid_height - 1) - 1;
+             }
+         }
+
+         this->ctx.copy_buffer(this->grid_buffer, staging_buffer, grid_buffer_size);
+     }
+
+     // Initialize the index buffer: every grid square is split into two triangles, each
+     // described by three vertex indices (vertex (x, y) has index y * grid_width + x).
+     {
+         uint32_t* indices = reinterpret_cast<uint32_t*>(staging);
+         for(uint32_t y = 0; y < grid_height - 1; ++y)
+         {
+             for(uint32_t x = 0; x < grid_width - 1; ++x)
+             {
+                 // First triangle of the square.
+                 *indices++ = (y + 0) * grid_width + (x + 0);
+                 *indices++ = (y + 1) * grid_width + (x + 0);
+                 *indices++ = (y + 0) * grid_width + (x + 1);
+                 // Second triangle of the square.
+                 *indices++ = (y + 1) * grid_width + (x + 0);
+                 *indices++ = (y + 1) * grid_width + (x + 1);
+                 *indices++ = (y + 0) * grid_width + (x + 1);
+             }
+         }
+
+         this->ctx.copy_buffer(this->index_buffer, staging_buffer, index_buffer_size);
+     }
+
+     // We are done with the staging buffer so clean it up.
+     this->ctx.vkd->unmap_memory(this->ctx.dev, staging_memory);
+     this->ctx.vkd->free_memory(this->ctx.dev, staging_memory, nullptr);
+     this->ctx.vkd->destroy_buffer(this->ctx.dev, staging_buffer, nullptr);
+ }
+
+ /// \brief Build the Vulkan graphics pipeline of the renderer from the sinewave vertex and
+ /// fragment shaders, and store it in \p pipeline. Requires \p pipeline_layout and
+ /// \p render_pass to have been created already.
+ void create_pipeline()
+ {
+     const VkShaderModule vert_module
+         = create_shader_module(this->ctx, std::size(sinewave_vert), sinewave_vert);
+     const VkShaderModule frag_module
+         = create_shader_module(this->ctx, std::size(sinewave_frag), sinewave_frag);
+
+     // Keep in sync with shaders!
+     // Two stages, both entering at "main": the vertex shader, then the fragment shader.
+     VkPipelineShaderStageCreateInfo stages[2] = {};
+
+     stages[0].sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+     stages[0].stage  = VK_SHADER_STAGE_VERTEX_BIT;
+     stages[0].module = vert_module;
+     stages[0].pName  = "main";
+
+     stages[1].sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+     stages[1].stage  = VK_SHADER_STAGE_FRAGMENT_BIT;
+     stages[1].module = frag_module;
+     stages[1].pName  = "main";
+
+     // Keep in sync with shaders!
+     // Binding 0 advances by one float per vertex, binding 1 by a pair of floats per vertex.
+     VkVertexInputBindingDescription vertex_bindings[2] = {};
+
+     vertex_bindings[0].binding   = 0;
+     vertex_bindings[0].stride    = sizeof(float);
+     vertex_bindings[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+     vertex_bindings[1].binding   = 1;
+     vertex_bindings[1].stride    = sizeof(float) * 2;
+     vertex_bindings[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+     // Keep in sync with shaders!
+     // Location 0 reads a single float from binding 0, location 1 a float pair from binding 1.
+     VkVertexInputAttributeDescription vertex_attribs[2] = {};
+
+     vertex_attribs[0].location = 0;
+     vertex_attribs[0].binding  = 0;
+     vertex_attribs[0].format   = VK_FORMAT_R32_SFLOAT;
+
+     vertex_attribs[1].location = 1;
+     vertex_attribs[1].binding  = 1;
+     vertex_attribs[1].format   = VK_FORMAT_R32G32_SFLOAT;
+
+     this->pipeline = this->ctx.create_simple_pipeline(this->pipeline_layout,
+                                                       this->render_pass,
+                                                       stages,
+                                                       std::size(stages),
+                                                       vertex_bindings,
+                                                       std::size(vertex_bindings),
+                                                       vertex_attribs,
+                                                       std::size(vertex_attribs));
+
+     // Once the pipeline exists the shader modules are no longer needed.
+     this->ctx.vkd->destroy_shader_module(this->ctx.dev, vert_module, nullptr);
+     this->ctx.vkd->destroy_shader_module(this->ctx.dev, frag_module, nullptr);
+ }
+
+ /// \brief Re-create the backing swapchain and re-initialize frame buffers if the swapchain
+ /// has become outdated.
+ /// \param window - The window whose current framebuffer size determines the new extent.
+ /// \returns \p true if the swapchain and framebuffers were re-created, \p false if the
+ /// window's framebuffer currently has a width or height of zero, in which case nothing
+ /// is re-created.
+ bool recreate_swapchain(GLFWwindow* const window)
+ {
+     // Wait until presentation has finished before touching the swapchain.
+     VK_CHECK(this->ctx.vkd->queue_wait_idle(this->ctx.present_queue.queue));
+
+     int width  = 0;
+     int height = 0;
+     glfwGetFramebufferSize(window, &width, &height);
+     if(width == 0 || height == 0)
+     {
+         return false;
+     }
+
+     // GLFW reports the size as int, the Vulkan extent is made up of unsigned 32-bit values.
+     this->sc.recreate({static_cast<uint32_t>(width), static_cast<uint32_t>(height)});
+     this->sc.recreate_framebuffers(this->render_pass, this->framebuffers);
+
+     return true;
+ }
+
+ /// \brief Prepare the current frame for recording: wait for its previous use to finish,
+ /// acquire a swapchain image, and begin its command buffer.
+ /// \param window - The window, used to query the new size if the swapchain is re-created.
+ /// \returns whether the frame can be rendered at all. This may not be the case on
+ /// some operating systems, for example if the window is minimized and has a
+ /// surface extent of 0 by 0 pixels, or if the swapchain turned out to be out of date.
+ bool begin_frame(GLFWwindow* const window)
+ {
+     const size_t slot    = this->frame_index % this->frames.size();
+     const frame& current = this->frames[slot];
+
+     // Block until the previous use of this frame slot is done rendering.
+     current.wait();
+
+     // A swapchain that was flagged as stale is rebuilt before acquiring an image from it.
+     if(this->swapchain_out_of_date)
+     {
+         const bool recreated = this->recreate_swapchain(window);
+         if(!recreated)
+         {
+             return false;
+         }
+         this->swapchain_out_of_date = false;
+     }
+
+     // Acquire the next image index from the swapchain.
+     const swapchain::present_state acquired
+         = this->sc.acquire_next_image(current.image_acquired, frame_timeout);
+     switch(acquired)
+     {
+         case swapchain::present_state::optimal:
+             break;
+         case swapchain::present_state::suboptimal:
+             // The semaphore is already signaled, so this frame is still rendered;
+             // the swapchain is re-created on the next frame.
+             this->swapchain_out_of_date = true;
+             break;
+         case swapchain::present_state::out_of_date:
+             // Nothing can be rendered; skip this frame and re-create the swapchain.
+             this->swapchain_out_of_date = true;
+             return false;
+     }
+
+     // Work is about to be created for this frame, so un-signal its fence.
+     current.reset();
+
+     // Resetting the pool resets the frame's command buffer, which can then be opened for
+     // recording the draw commands of this frame.
+     VK_CHECK(this->ctx.vkd->reset_command_pool(this->ctx.dev, current.cmd_pool, 0));
+
+     VkCommandBufferBeginInfo cmd_begin_info = {};
+     cmd_begin_info.sType                    = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+     cmd_begin_info.flags                    = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+     VK_CHECK(this->ctx.vkd->begin_command_buffer(current.cmd_buf, &cmd_begin_info));
+
+     return true;
+ }
+
+ /// \brief Finish recording the current frame, submit it to the graphics queue for
+ /// rendering, hand it to the swapchain for presenting, and advance to the next frame.
+ void end_frame()
+ {
+     const frame& current = this->frames[this->frame_index % this->frames.size()];
+
+     VK_CHECK(this->ctx.vkd->end_command_buffer(current.cmd_buf));
+
+     // Semaphores that have to be signaled before the frame can be rendered completely:
+     // - image_acquired: vkAcquireNextImageKHR may already report _which_ image is rendered
+     //   to next before that image is actually ready for use.
+     // - simulation_finished (only when HIP signals it): HIP has to be done writing the
+     //   height buffer.
+#if USE_EXTERNAL_SEMAPHORES == 1 && USE_SIGNAL_SEMAPHORE == 1
+     VkSemaphore wait_list[] = {current.image_acquired, this->simulation_finished};
+#else
+     VkSemaphore wait_list[] = {current.image_acquired};
+#endif
+
+     // The pipeline stage at which the corresponding entry of the wait list is waited upon,
+     // so that earlier stages can already run while a semaphore is still unsignaled:
+     // - The swapchain image is only needed once color output is written.
+     // - The height buffer is read as vertex input, so the simulation has to be finished
+     //   by the time vertex inputs are consumed.
+     // This array always has two entries, also when the wait list holds just one semaphore.
+     const VkPipelineStageFlags wait_stages[]
+         = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT};
+
+     // Semaphores that are signaled once the submitted work has finished:
+     // - render_finished: the result is on the swapchain image and may be presented.
+     // - buffer_ready (only with external semaphores): rendering no longer uses the height
+     //   buffer, so HIP may run the next simulation step without racing with Vulkan.
+#if USE_EXTERNAL_SEMAPHORES == 1
+     VkSemaphore signal_list[] = {current.render_finished, this->buffer_ready};
+#else
+     VkSemaphore signal_list[] = {current.render_finished};
+#endif
+
+     // Submit the frame's command buffer; frame_fence is signaled when it completes.
+     VkSubmitInfo submit         = {};
+     submit.sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+     submit.commandBufferCount   = 1;
+     submit.pCommandBuffers      = &current.cmd_buf;
+     submit.waitSemaphoreCount   = std::size(wait_list);
+     submit.pWaitSemaphores      = wait_list;
+     submit.pWaitDstStageMask    = wait_stages;
+     submit.signalSemaphoreCount = std::size(signal_list);
+     submit.pSignalSemaphores    = signal_list;
+     VK_CHECK(this->ctx.vkd->queue_submit(this->ctx.graphics_queue.queue,
+                                          1,
+                                          &submit,
+                                          current.frame_fence));
+
+     // Ask the swapchain to present the image once rendering has finished. Anything other
+     // than an optimal result schedules a swapchain re-creation.
+     if(this->sc.present(current.render_finished) != swapchain::present_state::optimal)
+     {
+         this->swapchain_out_of_date = true;
+     }
+
+     ++this->frame_index;
+ }
+
+ /// \brief This function updates the height buffer with new heights.
+ ///
+ /// It first makes sure that Vulkan is no longer using the height buffer (by waiting on
+ /// \p hip_buffer_ready on the HIP stream, or by a full Vulkan queue sync), then launches
+ /// \p sinewave_kernel on \p hip_stream with the time elapsed since \p start_time, and finally
+ /// either signals \p hip_simulation_finished on the stream or synchronizes the stream.
+ void step_simulation()
+ {
+ // Take care that we are not going to modify the buffer before it is ready.
+#if USE_EXTERNAL_SEMAPHORES == 1
+ // If semaphores are supported and used, we need to wait on it so that it is
+ // certain that Vulkan is no longer using the buffer.
+ // Note: This semaphore is not signaled in the first frame, so we don't need to wait
+ // on it then.
+ if(this->frame_index != 0)
+ {
+ // [Sphinx wait semaphore start]
+ // The wait is enqueued on the stream; it does not block the host.
+ hipExternalSemaphoreWaitParams wait_params = {};
+ HIP_CHECK(hipWaitExternalSemaphoresAsync(&this->hip_buffer_ready,
+ &wait_params,
+ 1,
+ this->hip_stream));
+ // [Sphinx wait semaphore end]
+ }
+#else
+ // If semaphores are not supported or not used, then we need to perform a full queue
+ // sync to be sure that Vulkan is not using the buffer anymore.
+ VK_CHECK(this->ctx.vkd->queue_wait_idle(this->ctx.graphics_queue.queue));
+#endif
+
+ // The simulation time is the time elapsed since the renderer was created.
+ // NOTE(review): std::chrono::duration is used without template arguments here; they appear
+ // to have been lost (the result is stored in a float) - TODO restore and confirm.
+ const auto now = std::chrono::high_resolution_clock::now();
+ const float time
+ = std::chrono::duration(now - this->start_time)
+ .count();
+
+ // The tile size to be used for each block of the computation. A tile is
+ // tile_size by tile_size threads in this case, since we are invoking the
+ // computation over a 2D-grid.
+ constexpr size_t tile_size = 8;
+
+ // [Sphinx kernel call start]
+ // Launch the HIP kernel to advance the simulation.
+ // NOTE(review): the launch configuration below looks truncated - only the stream is visible
+ // and tile_size is otherwise unused. Presumably the grid and block dimensions derived from
+ // tile_size were lost; TODO restore from the original example.
+ sinewave_kernel<<hip_stream>>>(this->hip_height_buffer, time);
+ // Kernel launches do not return a status; launch errors are picked up here.
+ HIP_CHECK(hipGetLastError());
+ // [Sphinx kernel call end]
+
+ // Signal to Vulkan that we are done with the buffer and that it can proceed
+ // with rendering.
+#if USE_EXTERNAL_SEMAPHORES == 1 && USE_SIGNAL_SEMAPHORE == 1
+ // If semaphores are supported and used, signal the semaphore that indicates
+ // that the simulation has finished.
+ // [Sphinx signal semaphore start]
+ hipExternalSemaphoreSignalParams signal_params = {};
+ HIP_CHECK(hipSignalExternalSemaphoresAsync(&this->hip_simulation_finished,
+ &signal_params,
+ 1,
+ this->hip_stream));
+ // [Sphinx signal semaphore end]
+#else
+ // If semaphores are not used or not supported, we need to again perform a full
+ // queue sync from the HIP side this time.
+ HIP_CHECK(hipStreamSynchronize(this->hip_stream));
+#endif
+ }
+
+ /// \brief Draw the next frame to the window: begin the frame, advance the simulation on the
+ /// HIP side, record and submit the Vulkan draw commands, and print an FPS report roughly
+ /// every five seconds.
+ /// \param window - The window to draw to; passed on to \p begin_frame.
+ void draw(GLFWwindow* const window)
+ {
+ // Nothing is simulated or drawn when the frame cannot be rendered.
+ if(!this->begin_frame(window))
+ return;
+
+ // Advance the simulation on the HIP side.
+ this->step_simulation();
+
+ // Render the grid to the screen from the Vulkan side.
+ const frame& frame = frames[this->frame_index % this->frames.size()];
+ const VkCommandBuffer cmd_buf = frame.cmd_buf;
+
+ // Initialize the rendering pass
+ // The clear color is zero-initialized.
+ VkClearValue clear_color = {};
+
+ // The viewport and the scissor both cover the full swapchain extent.
+ VkViewport viewport = {};
+ viewport.width = this->sc.extent.width;
+ viewport.height = this->sc.extent.height;
+ viewport.minDepth = 0;
+ viewport.maxDepth = 1;
+
+ VkRect2D scissor = {};
+ scissor.extent = this->sc.extent;
+
+ const device_dispatch& vkd = *this->ctx.vkd;
+
+ vkd.cmd_set_viewport(cmd_buf, 0, 1, &viewport);
+ vkd.cmd_set_scissor(cmd_buf, 0, 1, &scissor);
+
+ // Render into the framebuffer that belongs to the swapchain image acquired for this frame.
+ VkRenderPassBeginInfo rp_begin_info = {};
+ rp_begin_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+ rp_begin_info.renderPass = this->render_pass;
+ rp_begin_info.framebuffer = this->framebuffers[this->sc.image_index];
+ rp_begin_info.renderArea = scissor;
+ rp_begin_info.clearValueCount = 1;
+ rp_begin_info.pClearValues = &clear_color;
+ vkd.cmd_begin_render_pass(cmd_buf, &rp_begin_info, VK_SUBPASS_CONTENTS_INLINE);
+
+ // Bind the pipeline that we are using to render with.
+ vkd.cmd_bind_pipeline(cmd_buf, VK_PIPELINE_BIND_POINT_GRAPHICS, this->pipeline);
+
+ // The order matches the vertex bindings of the pipeline: binding 0 is the height buffer
+ // (one float per vertex), binding 1 is the grid buffer (two floats per vertex).
+ VkBuffer vertex_buffers[] = {this->height_buffer, this->grid_buffer};
+ VkDeviceSize offsets[] = {0, 0};
+ vkd.cmd_bind_vertex_buffers(cmd_buf, 0, std::size(vertex_buffers), vertex_buffers, offsets);
+ vkd.cmd_bind_index_buffer(cmd_buf, this->index_buffer, 0, VK_INDEX_TYPE_UINT32);
+
+ // Draw the triangles.
+ vkd.cmd_draw_indexed(cmd_buf, num_indices, 1, 0, 0, 0);
+
+ vkd.cmd_end_render_pass(cmd_buf);
+
+ // Submit and present the frame; this also advances frame_index.
+ this->end_frame();
+
+ // Output a naive performance measurement: the average over the frames rendered since the
+ // last report, once more than five seconds have passed.
+ const auto frame_time = std::chrono::high_resolution_clock::now();
+ const auto time_diff = frame_time - this->fps_start_time;
+ if(time_diff > std::chrono::seconds{5})
+ {
+ // NOTE(review): the target type of duration_cast appears to have been lost here;
+ // presumably a floating-point duration in seconds - TODO restore and confirm.
+ const auto time_diff_sec
+ = std::chrono::duration_cast>(time_diff).count();
+ const uint32_t frames = this->frame_index - this->fps_start_frame;
+ std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
+ << " seconds): " << double_precision(frames / time_diff_sec, 2, true) << " ("
+ << double_precision((time_diff_sec * 1000) / frames, 2, true)
+ << " ms per frame)" << std::endl;
+ // Start the next measurement interval at the current frame.
+ this->fps_start_frame = this->frame_index;
+ this->fps_start_time = frame_time;
+ }
+ }
+};
+};
+
+/// \brief GLFW framebuffer resize callback: If the window is resized then we need to re-create
+/// the swapchain on the next frame. The new size itself is not used here; it is queried again
+/// when the swapchain is re-created.
+/// \param window - The window that was resized. Its user pointer is expected to point to the
+/// \p renderer drawing to it; the callback does nothing if no user pointer is set.
+void resize_callback(GLFWwindow* const window, const int, const int)
+{
+    // The user pointer is stored as void*, so it has to be cast back to the renderer.
+    renderer* const r = static_cast<renderer*>(glfwGetWindowUserPointer(window));
+    if(r != nullptr)
+    {
+        r->swapchain_out_of_date = true;
+    }
+}
+
+/// \brief Program entry point. Sets up GLFW and the window, the Vulkan instance and surface,
+/// selects a device usable from both Vulkan and HIP, and then renders frames until the
+/// window is closed.
+/// \returns 0 on success, or \p error_exit_code if GLFW could not be initialized.
+int main()
+{
+ // The initial size of the GLFW window when the example is first started.
+ constexpr VkExtent2D initial_window_extent = {1280, 800};
+
+ // Initialize GLFW.
+ // The error callback is installed first so that initialization errors are reported too.
+ glfwSetErrorCallback(
+ [](int code, const char* const message)
+ { std::cerr << "A glfw error encountered: " << message << "(" << code << ")\n"; });
+
+ if(glfwInit() != GLFW_TRUE)
+ {
+ std::cerr << "failed to initialize GLFW\n";
+ return error_exit_code;
+ }
+
+ // Initialize the window.
+ VkApplicationInfo app_info = {};
+ app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+ app_info.pApplicationName = "HIP-Vulkan interop example";
+ app_info.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
+ app_info.pEngineName = "rocm-examples";
+ app_info.engineVersion = VK_MAKE_VERSION(1, 0, 0);
+ app_info.apiVersion = VK_MAKE_VERSION(1, 0, 0);
+
+ GLFWwindow* window = create_window(app_info, initial_window_extent);
+
+ // Create the base Vulkan types: Load base function pointers, create instance, load
+ // instance function pointers, create the surface.
+ // NOTE(review): both std::make_unique calls below lack their template argument (the
+ // dispatch-table types); they appear to have been lost - TODO restore and confirm.
+ const auto vkb = std::make_unique(glfwGetInstanceProcAddress);
+ const VkInstance instance = create_instance(*vkb,
+ app_info,
+ required_instance_extensions,
+ std::size(required_instance_extensions));
+ const auto vki = std::make_unique(*vkb, instance);
+ const VkSurfaceKHR surface = create_surface(instance, window);
+
+ // Try to find a physical device that we can use for this example.
+ // The result is written to candidate, which holds both the Vulkan and the HIP side of the
+ // selected device.
+ physical_device_candidate candidate;
+ find_physical_device(*vki, instance, surface, candidate);
+
+ const hipDevice_t hip_device = candidate.hip_candidate.device;
+
+ // Let the user know which device we are using, on both the Vulkan and HIP sides.
+ hipDeviceProp_t hip_props;
+ HIP_CHECK(hipGetDeviceProperties(&hip_props, hip_device));
+
+ std::cout << "Using device " << candidate.props.deviceName << " (hip device " << hip_device
+ << ", UUID " << candidate.hip_candidate.device_uuid << ", compute capability "
+ << hip_props.major << "." << hip_props.minor << ")\n";
+
+ {
+ // Initialize the rendering resources, both the Vulkan and HIP ones.
+ // These are defined in a sub-scope so that the destructors are
+ // invoked before we call `glfwDestroyWindow` and `glfwTerminate`.
+ graphics_context ctx(vki.get(),
+ instance,
+ surface,
+ candidate.pdev,
+ candidate.queues,
+ required_device_extensions,
+ std::size(required_device_extensions));
+
+ swapchain swapchain(ctx, initial_window_extent);
+ renderer renderer(ctx, swapchain, hip_device);
+
+ // Make the renderer reachable from the resize callback through the window's user pointer.
+ // NOTE(review): the reinterpret_cast below lacks its target type (the user pointer is
+ // passed to GLFW as a pointer) - TODO restore and confirm.
+ glfwSetWindowUserPointer(window, reinterpret_cast(&renderer));
+ glfwSetFramebufferSizeCallback(window, resize_callback);
+
+ // The main rendering loop.
+ // Repeat for as long as the window is not closed.
+ while(glfwWindowShouldClose(window) == GLFW_FALSE)
+ {
+ renderer.draw(window);
+ glfwPollEvents();
+ }
+
+ // Unhook the callback and the user pointer before the renderer goes out of scope, so
+ // that no callback can reach the destroyed renderer.
+ glfwSetFramebufferSizeCallback(window, nullptr);
+ glfwSetWindowUserPointer(window, nullptr);
+ }
+
+ // Destroy the surface and instance now that we are done with them.
+ vki->destroy_surface(instance, surface, nullptr);
+ vki->destroy_instance(instance, nullptr);
+
+ // Clean up GLFW.
+ glfwDestroyWindow(window);
+ glfwTerminate();
+
+ return 0;
+}
diff --git a/docs/tools/example_codes/opengl_interop.hip b/docs/tools/example_codes/opengl_interop.hip
new file mode 100644
index 0000000000..64ece9ddf2
--- /dev/null
+++ b/docs/tools/example_codes/opengl_interop.hip
@@ -0,0 +1,628 @@
+// MIT License
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "nvidia_hip_fix.hpp"
+
+#include "example_utils.hpp"
+
+#include "glad/glad.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/// \brief The number of grid points (vertices) along the width of the example's grid.
+constexpr uint32_t grid_width = 256;
+/// \brief The number of grid points (vertices) along the height of the example's grid.
+constexpr uint32_t grid_height = 256;
+
+/// \brief The GLSL source of the OpenGL vertex shader that is used to render the triangles in this example.
+/// The grid x- and y-positions (in_xy) are used directly as the vertex position in clip space (z = 0, w = 1).
+/// The height value (in_height) is passed on unchanged to the fragment shader as frag_height.
+constexpr const char* vertex_shader = R"(
+#version 330 core
+
+in float in_height;
+in vec2 in_xy;
+
+out float frag_height;
+
+void main()
+{
+ gl_Position = vec4(in_xy, 0, 1);
+ frag_height = in_height;
+}
+)";
+
+/// \brief The GLSL source of the OpenGL fragment shader that is used to render the triangles in this example.
+/// The "height" value is used to shade the fragment: it is mapped to a gray level as frag_height * 0.5 + 0.5.
+/// Its values are interpolated across each triangle between the vertex and fragment shaders.
+constexpr const char* fragment_shader = R"(
+#version 330 core
+
+in float frag_height;
+
+void main()
+{
+ gl_FragColor = vec4(vec3(frag_height * 0.5 + 0.5), 1.0);
+}
+)";
+
+/// \brief Create the GLFW window (with an OpenGL 3.3 core debug context) at the given size.
+GLFWwindow* create_window(const int initial_width, const int initial_height)
+{
+    /// [Sphinx-create-window]
+    // Request an OpenGL 3.3 core-profile context with debug output enabled.
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+    glfwWindowHint(GLFW_OPENGL_DEBUG_CONTEXT, GLFW_TRUE);
+
+    // The two trailing null pointers are the monitor and the context to share with.
+    const char* const title = "OpenGL-HIP interop example";
+    GLFWwindow* const handle
+        = glfwCreateWindow(initial_width, initial_height, title, nullptr, nullptr);
+    if(!handle)
+    {
+        std::cerr << "Failed to create GLFW window\n";
+        std::exit(error_exit_code);
+    }
+    /// [Sphinx-create-window]
+    return handle;
+}
+
+/// \brief Query HIP for a device that corresponds to the current OpenGL context.
+/// \returns The id of the first HIP device reported for the OpenGL context. When
+/// none is reported, a message is written to stderr and the program is exited.
+int pick_hip_device()
+{
+    /// [Sphinx-pick device]
+    // Ask for at most one device: the first match is enough for this example.
+    int          device_id;
+    unsigned int num_found;
+    HIP_CHECK(hipGLGetDevices(&num_found, &device_id, 1, hipGLDeviceList::hipGLDeviceListAll));
+
+    if(num_found < 1)
+    {
+        std::cerr << "System has no OpenGL-capable HIP devices" << std::endl;
+        std::exit(error_exit_code);
+    }
+    /// [Sphinx-pick device]
+
+    return device_id;
+}
+
+/// \brief Compile GLSL source into an OpenGL shader object. If compilation fails,
+/// the shader info log is printed to stderr and the program is exited.
+/// \param type - The OpenGL shader type for this shader, for example
+/// \p GL_VERTEX_SHADER or \p GL_FRAGMENT_SHADER.
+/// \param source - The null-terminated GLSL source code for the shader.
+/// \returns The handle of the successfully compiled shader.
+GLuint compile_shader(const GLenum type, const char* const source)
+{
+    const GLuint shader = glCreateShader(type);
+
+    const GLint length = static_cast<GLint>(std::strlen(source));
+    glShaderSource(shader, 1, &source, &length);
+    glCompileShader(shader);
+
+    GLint compile_status;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
+    if(compile_status != GL_TRUE)
+    {
+        // Compiling failed, get the shader log and print it to the user.
+        GLint log_length;
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
+        std::vector<char> log(log_length);
+        // The buffer size passed here is the size of `log`, not the length of the shader source.
+        glGetShaderInfoLog(shader, log_length, nullptr, log.data());
+        std::cerr << "Failed to compile shader:\n";
+        std::cerr.write(log.data(), log.size()) << std::endl;
+        std::exit(error_exit_code);
+    }
+
+    return shader;
+}
+
+/// \brief Compile a vertex and a fragment shader and link them into an OpenGL shader program.
+/// If compiling or linking fails, a log is printed to stderr and the program is exited.
+/// \param vert_src - The GLSL source code for the shader program's vertex shader.
+/// \param frag_src - The GLSL source code for the shader program's fragment shader.
+/// \returns The handle of the linked program; its shader objects are already detached and deleted.
+GLuint compile_shader_program(const char* const vert_src, const char* const frag_src)
+{
+ const GLuint program = glCreateProgram();
+
+ const GLuint vert = compile_shader(GL_VERTEX_SHADER, vert_src);
+ const GLuint frag = compile_shader(GL_FRAGMENT_SHADER, frag_src);
+
+ glAttachShader(program, frag);
+ glAttachShader(program, vert);
+
+ glLinkProgram(program);
+
+ GLint link_status;
+ glGetProgramiv(program, GL_LINK_STATUS, &link_status);
+ if(link_status != GL_TRUE)
+ {
+ // Linking failed: fetch the program info log, print it to stderr and exit.
+ GLint log_length;
+ glGetProgramiv(program, GL_INFO_LOG_LENGTH, &log_length);
+ std::vector log(log_length);
+ glGetProgramInfoLog(program, log_length, nullptr, log.data());
+ std::cerr << "Failed to link program:\n";
+ std::cerr.write(log.data(), log.size()) << std::endl;
+ std::exit(error_exit_code);
+ }
+
+ glDetachShader(program, frag);
+ glDetachShader(program, vert);
+
+ glDeleteShader(frag);
+ glDeleteShader(vert);
+
+ return program;
+}
+
+/// \brief This structure contains the OpenGL handles that this example uses to render the
+/// triangle grid to the screen.
+///
+/// Three buffers are used to render the triangle grid, the color of which is determined by
+/// a HIP computation in \p simulator:
+/// - One buffer contains the height of each grid point (rendered as color).
+/// - One buffer holds the x- and y-coordinates of each grid point. Note: each grid point is stored
+/// only once; the triangles that are made up from these points are defined by the index buffer.
+/// - One index buffer holds indices into the former two buffers to make up a list of triangles.
+struct renderer
+{
+ /// The total number of vertices for the triangles.
+ constexpr static size_t num_verts = grid_width * grid_height;
+ /// The number of bytes in the x- and y-coordinates buffer. Each x/y coordinate is encoded as
+ /// a pair of floats, which are stored in a packed array-of-structures format: | x | y | x | y | ... |.
+ constexpr static size_t grid_buffer_size = num_verts * sizeof(float) * 2;
+ /// The number of bytes in the height buffer. Each height is encoded as a floating point value.
+ /// This buffer will be shared with HIP, which is why these heights are
+ /// stored in a separate buffer.
+ constexpr static size_t height_buffer_size = num_verts * sizeof(float);
+
+ /// The number of indices in the index buffer. Each triangle has 3 points, each square in the grid
+ /// is made up of 2 triangles. There are (width - 1) by (height - 1) squares in the grid.
+ constexpr static size_t num_indices = (grid_width - 1) * (grid_height - 1) * 3 * 2;
+ /// The number of bytes in the index buffer. Each index is encoded as a 32-bit int.
+ constexpr static size_t index_buffer_size = num_indices * sizeof(uint32_t);
+
+ /// An OpenGL handle to a Vertex Array Object, which has the grid and height buffers
+ /// bound to the corresponding attribute in the shader program (program) used for rendering.
+ GLuint vao;
+
+ /// Handle to the buffer that holds the indices for the triangles to render.
+ GLuint index_buffer;
+
+ /// Handle to the buffer that holds the x- and y-coordinates for each grid point.
+ GLuint grid_buffer;
+
+ /// Handle to the buffer that holds the height of each grid point. This buffer is shared with HIP.
+ GLuint height_buffer;
+
+ /// Handle to the OpenGL shader program that this example uses to render the triangles to the screen.
+ GLuint program;
+
+ /// Counters used to keep track of the rendering performance.
+ uint32_t fps_frame = 0;
+ std::chrono::high_resolution_clock::time_point fps_start_time;
+
+ /// \brief Initialize OpenGL rendering resources.
+ renderer()
+ {
+ // Create a vertex array used to bind the attribute buffers.
+ glGenVertexArrays(1, &this->vao);
+
+ // Also generate the buffers in question.
+ GLuint buffers[3];
+ glGenBuffers(std::size(buffers), buffers);
+ this->index_buffer = buffers[0];
+ this->grid_buffer = buffers[1];
+ this->height_buffer = buffers[2];
+
+ // Compile the shader program used to render the triangles.
+ this->program = compile_shader_program(vertex_shader, fragment_shader);
+
+ // Upload the initial data to the buffers.
+ this->initialize_buffer_data();
+
+ // Set up the VAO by binding the height and grid buffers to the attribute locations
+ // in the shader program.
+ glBindVertexArray(this->vao);
+
+ // Note - keep variable "in_height" in sync with shader.
+ glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
+ const GLuint height_attrib = glGetAttribLocation(this->program, "in_height");
+ glVertexAttribPointer(height_attrib, 1, GL_FLOAT, GL_FALSE, 0, 0);
+ glEnableVertexAttribArray(height_attrib);
+
+ // Note - keep variable "in_xy" in sync with shader.
+ const GLuint grid_attrib = glGetAttribLocation(this->program, "in_xy");
+ glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
+ glVertexAttribPointer(grid_attrib, 2, GL_FLOAT, GL_FALSE, 0, 0);
+ glEnableVertexAttribArray(grid_attrib);
+
+ this->fps_start_time = std::chrono::high_resolution_clock::now();
+ }
+
+ renderer(const renderer&) = delete;
+ renderer& operator=(const renderer&) = delete;
+
+ renderer(renderer&&) = delete;
+ renderer& operator=(renderer&&) = delete;
+
+ ~renderer()
+ {
+ glDeleteProgram(this->program);
+ GLuint buffers[] = {this->index_buffer, this->grid_buffer, this->height_buffer};
+ glDeleteBuffers(std::size(buffers), buffers);
+ glDeleteVertexArrays(1, &this->vao);
+ }
+
+ /// \brief Allocate each buffer in OpenGL and upload the initial values of the grid and index buffers.
+ void initialize_buffer_data() const
+ {
+ // Initialize the height buffer.
+ glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
+ // We do not need to fill it, as that is going to be done from HIP, but we
+ // do need to allocate it from OpenGL. This is done simply by passing `nullptr` as
+ // initial data pointer.
+ // GL_DYNAMIC_DRAW is passed because this buffer is going to be updated every frame,
+ // and is going to be used to hold vertex data for drawing - this may help the driver
+ // to render more efficiently.
+ glBufferData(GL_ARRAY_BUFFER, height_buffer_size, nullptr, GL_DYNAMIC_DRAW);
+
+ // Initialize the grid buffer.
+ {
+ glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
+ // Avoid having to allocate on host by allocating the buffer in OpenGL and then mapping it
+ // into host-memory to initialize it.
+ // This buffer is going to be initialized once and is going to be used for drawing,
+ // so pass GL_STATIC_DRAW as usage hint.
+ glBufferData(GL_ARRAY_BUFFER, grid_buffer_size, nullptr, GL_STATIC_DRAW);
+
+ float* grid = reinterpret_cast(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+ for(uint32_t y = 0; y < grid_height; ++y)
+ {
+ for(uint32_t x = 0; x < grid_width; ++x)
+ {
+ *grid++ = (2.0f * x) / (grid_width - 1) - 1;
+ *grid++ = (2.0f * y) / (grid_height - 1) - 1;
+ }
+ }
+
+ // Let OpenGL know that we are done with this buffer.
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+ }
+
+ // Initialize the index buffer
+ {
+ glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
+ // Similar to the grid buffer, this buffer is going to be initialized once and is then used
+ // for drawing.
+ glBufferData(GL_ELEMENT_ARRAY_BUFFER, index_buffer_size, nullptr, GL_STATIC_DRAW);
+
+ uint32_t* indices
+ = reinterpret_cast(glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY));
+ for(uint32_t y = 0; y < grid_height - 1; ++y)
+ {
+ for(uint32_t x = 0; x < grid_width - 1; ++x)
+ {
+ *indices++ = (y + 0) * grid_width + (x + 0);
+ *indices++ = (y + 1) * grid_width + (x + 0);
+ *indices++ = (y + 0) * grid_width + (x + 1);
+ *indices++ = (y + 1) * grid_width + (x + 0);
+ *indices++ = (y + 1) * grid_width + (x + 1);
+ *indices++ = (y + 0) * grid_width + (x + 1);
+ }
+ }
+
+ glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER);
+ }
+ }
+
+ /// \brief Bind the OpenGL pipeline state for this renderer.
+ void bind() const
+ {
+ glBindVertexArray(this->vao);
+ glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
+ glUseProgram(this->program);
+ }
+
+ /// \brief Draw the next frame to the window. This requires the render state be bound using
+ /// bind.
+ void draw()
+ {
+ glDrawElements(GL_TRIANGLES, num_indices, GL_UNSIGNED_INT, nullptr);
+
+ // Output a simple performance measurement: the average FPS, printed once more than 5 seconds have passed.
+ ++this->fps_frame;
+ const auto frame_time = std::chrono::high_resolution_clock::now();
+ const auto time_diff = frame_time - this->fps_start_time;
+ if(time_diff > std::chrono::seconds{5})
+ {
+ const auto time_diff_sec
+ = std::chrono::duration_cast>(time_diff).count();
+ std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
+ << " seconds): " << double_precision(this->fps_frame / time_diff_sec, 2, true)
+ << " (" << double_precision((time_diff_sec * 1000) / this->fps_frame, 2, true)
+ << " ms per frame, " << this->fps_frame << " frames)" << std::endl;
+ this->fps_frame = 0;
+ this->fps_start_time = frame_time;
+ }
+ }
+};
+
+/// [Sphinx sinewave kernel start]
+/// \brief The main HIP kernel for this example - computes a simple sine wave over a
+/// 2-dimensional grid of points. Threads that fall outside the grid do not write anything.
+/// \param height_map - the grid of points to compute a sine wave for. It is expected to be
+/// a \p grid_width by \p grid_height array packed into memory (y on the inner axis).
+/// \param time - The current time relative to the start of the program.
+__global__ void sinewave_kernel(float* height_map, const float time)
+{
+    const float        freq = 10.f;
+    const unsigned int x    = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int y    = blockIdx.y * blockDim.y + threadIdx.y;
+    const float        u    = (2.f * x) / grid_width - 1.f;
+    const float        v    = (2.f * y) / grid_height - 1.f;
+    if(x < grid_width && y < grid_height)
+    {
+        // y is the inner axis, so consecutive x values are grid_height elements apart.
+        height_map[x * grid_height + y] = sinf(u * freq + time) * cosf(v * freq + time);
+    }
+}
+/// [Sphinx sinewave kernel end]
+
+/// \brief This structure contains the HIP state and functionality used to advance the simulation.
+/// Initializing a \p simulator fetches the OpenGL height buffer from the corresponding renderer,
+/// and imports it as a HIP device pointer. This pointer is then passed to the simulation kernel
+/// (sinewave_kernel), which updates the values in it. When renderer::draw is called,
+/// the updated values are read from the buffer in OpenGL and used to render the triangle grid.
+struct simulator
+{
+ /// The HIP stream used to advance the simulation. This must be created from an OpenGL-interop
+ /// capable device, see pick_hip_device.
+ hipStream_t hip_stream;
+ /// A HIP graphics resource that is imported from the OpenGL height buffer to simulate.
+ hipGraphicsResource_t hip_height_buffer;
+ /// A device pointer to the height buffer, imported from the OpenGL height buffer.
+ float* hip_height_ptr;
+
+ /// The start time of the program, used for the simulation.
+ std::chrono::high_resolution_clock::time_point start_time;
+
+ /// \brief Initialize a simulator that uses a particular HIP device.
+ /// \param hip_device - The HIP device that is made current and on which the stream is created.
+ /// \param renderer - The renderer used to render the example. Its height buffer is imported to HIP for use with this simulator.
+ explicit simulator(const int hip_device, const renderer& renderer)
+ {
+ // Create a HIP stream for the target device.
+ HIP_CHECK(hipSetDevice(hip_device));
+ HIP_CHECK(hipStreamCreate(&this->hip_stream));
+
+ // [Sphinx buffer register and get start]
+ // Import the OpenGL height buffer into a HIP graphics resource.
+ HIP_CHECK(hipGraphicsGLRegisterBuffer(
+ &this->hip_height_buffer,
+ renderer.height_buffer,
+ // We are going to write to this buffer from HIP,
+ // but we do not need to read from it.
+ // As an optimization we can pass hipGraphicsRegisterFlagsWriteDiscard,
+ // so that the driver knows that we do not need the old values of
+ // the buffer.
+ hipGraphicsRegisterFlagsWriteDiscard));
+
+ // After importing the OpenGL height buffer into HIP, map it into HIP memory so that we can use it.
+ HIP_CHECK(hipGraphicsMapResources(1, &this->hip_height_buffer, this->hip_stream));
+
+ // Fetch the device pointer that points to the OpenGL buffer's memory.
+ // This function also fetches the size of the buffer. We already know it, but we still need to pass
+ // a valid pointer to hipGraphicsResourceGetMappedPointer.
+ size_t size;
+ HIP_CHECK(
+ hipGraphicsResourceGetMappedPointer(reinterpret_cast(&this->hip_height_ptr),
+ &size,
+ this->hip_height_buffer));
+ // [Sphinx buffer register and get end]
+
+ this->start_time = std::chrono::high_resolution_clock::now();
+ }
+
+ simulator(const simulator&) = delete;
+ simulator& operator=(const simulator&) = delete;
+
+ simulator(simulator&&) = delete;
+ simulator& operator=(simulator&&) = delete;
+
+ ~simulator()
+ {
+ // [Sphinx unregister start]
+ HIP_CHECK(hipStreamSynchronize(this->hip_stream));
+ HIP_CHECK(hipGraphicsUnmapResources(1, &this->hip_height_buffer, this->hip_stream));
+ HIP_CHECK(hipGraphicsUnregisterResource(this->hip_height_buffer));
+ HIP_CHECK(hipStreamDestroy(this->hip_stream));
+ // [Sphinx unregister end]
+ }
+
+ /// \brief Advance the simulation one step, using the time elapsed since start_time as input.
+ void step()
+ {
+ const auto now = std::chrono::high_resolution_clock::now();
+ const float time
+ = std::chrono::duration(now - this->start_time)
+ .count();
+
+ // [Sphinx buffer use in kernel start]
+ // The tile size to be used for each block of the computation. A tile is
+ // tile_size by tile_size threads in this case, since we are invoking the
+ // computation over a 2D-grid.
+ constexpr size_t tile_size = 8;
+
+ // Launch the HIP kernel to advance the simulation.
+ sinewave_kernel<<hip_stream>>>(this->hip_height_ptr, time);
+
+ // Check that no errors occurred while launching the kernel.
+ HIP_CHECK(hipGetLastError());
+ // [Sphinx buffer use in kernel end]
+ }
+};
+
+/// \brief GLFW resize callback: keeps the OpenGL viewport in sync with the window by
+/// resetting it to cover the full new width and height.
+void resize_callback(GLFWwindow* const window, const int width, const int height)
+{
+    static_cast<void>(window); // Part of the GLFW callback signature, not needed here.
+    glViewport(0, 0, width, height);
+}
+
+/// \brief Program entry point: sets up GLFW and OpenGL, picks a HIP device and runs the simulate-and-draw loop until the window is closed.
+int main()
+{
+ // The initial width of the GLFW window when the example is first started.
+ constexpr int initial_window_width = 1280;
+ // The initial height of the GLFW window.
+ constexpr int initial_window_height = 800;
+
+ // Initialize GLFW.
+ glfwSetErrorCallback(
+ [](int code, const char* const message)
+ { std::cerr << "A glfw error encountered: " << message << "(" << code << ")\n"; });
+
+ if(glfwInit() != GLFW_TRUE)
+ {
+ std::cerr << "failed to initialize GLFW\n";
+ return error_exit_code;
+ }
+
+ // Initialize the GLFW window used to render the example.
+ GLFWwindow* const window = create_window(initial_window_width, initial_window_height);
+
+ // Ensure that we are using the OpenGL context associated to the Window.
+ glfwMakeContextCurrent(window);
+
+ // [Sphinx opengl functions load start]
+ // Load the OpenGL function pointers with glad, using glfwGetProcAddress as the loader - we need
+ // this for the more recent OpenGL functions, as these are not loaded by default on all platforms.
+ if(!gladLoadGLLoader(reinterpret_cast(glfwGetProcAddress)))
+ {
+ std::cerr << "Failed to load OpenGL function pointers" << std::endl;
+ return error_exit_code;
+ }
+ // [Sphinx opengl functions load end]
+
+ // Disable vsync.
+ glfwSwapInterval(0);
+
+ // If the OpenGL GL_ARB_debug_output extension is present, set a callback that is called
+ // whenever an OpenGL error occurs. This saves us calling glGetError after every OpenGL function.
+ if(GLAD_GL_ARB_debug_output)
+ {
+ glDebugMessageCallbackARB(
+ [](GLenum,
+ GLenum,
+ GLuint,
+ GLenum severity,
+ GLsizei length,
+ const GLchar* message,
+ const void*)
+ {
+ std::cerr << "[OpenGL] ";
+ std::cerr.write(message, length) << std::endl;
+ if(severity == GL_DEBUG_SEVERITY_HIGH_ARB)
+ {
+ std::exit(error_exit_code);
+ }
+ },
+ nullptr);
+ // We just want the errors: First disable all messaging, and then enable just the
+ // most severe ones.
+ glDebugMessageControlARB(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, NULL, GL_FALSE);
+ glDebugMessageControlARB(GL_DONT_CARE,
+ GL_DONT_CARE,
+ GL_DEBUG_SEVERITY_HIGH_ARB,
+ 0,
+ NULL,
+ GL_TRUE);
+ // Report errors synchronously instead of asynchronously.
+ glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB);
+ }
+
+ // Figure out which HIP device we need to use.
+ // This device needs to be interop-capable (see pick_hip_device).
+ const int hip_device = pick_hip_device();
+
+ // Let the user know which device we are using, on both the OpenGL and HIP sides.
+ hipDeviceProp_t hip_props;
+ HIP_CHECK(hipGetDeviceProperties(&hip_props, hip_device));
+ const GLubyte* const device_name = glGetString(GL_RENDERER);
+ std::cout << "Using device " << device_name << " (hip device " << hip_device
+ << ", compute capability " << hip_props.major << "." << hip_props.minor << ")\n";
+
+ // Sub-scope so that the renderer and simulator destructors run before GLFW is terminated.
+ {
+ renderer renderer;
+ simulator simulator(hip_device, renderer);
+
+ // There are no other renderers, so we can bind the OpenGL state once.
+ renderer.bind();
+
+ glfwSetFramebufferSizeCallback(window, resize_callback);
+ glClearColor(0, 0, 0, 1);
+
+ // The main rendering loop.
+ // Repeat for as long as the window is not closed.
+ while(glfwWindowShouldClose(window) == GLFW_FALSE)
+ {
+ glClear(GL_COLOR_BUFFER_BIT);
+
+ // First step the simulation so that the height buffer is ready
+ // for the next frame.
+ simulator.step();
+
+ // Draw the example to the window's framebuffer.
+ renderer.draw();
+
+ // Present the framebuffer on screen.
+ glfwSwapBuffers(window);
+ glfwPollEvents();
+ }
+ }
+
+ // Clean up GLFW.
+ glfwDestroyWindow(window);
+ glfwTerminate();
+}
diff --git a/docs/tools/update_example_codes.py b/docs/tools/update_example_codes.py
new file mode 100644
index 0000000000..32a4320750
--- /dev/null
+++ b/docs/tools/update_example_codes.py
@@ -0,0 +1,4 @@
+import urllib.request
+
+urllib.request.urlretrieve("https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/opengl_interop/main.hip", "docs/tools/example_codes/opengl_interop.hip")
+urllib.request.urlretrieve("https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/vulkan_interop/main.hip", "docs/tools/example_codes/external_interop.hip")
diff --git a/docs/tutorial/saxpy.rst b/docs/tutorial/saxpy.rst
index 91ecc10be7..c3dc766102 100644
--- a/docs/tutorial/saxpy.rst
+++ b/docs/tutorial/saxpy.rst
@@ -143,10 +143,12 @@ Retrieval of the result from the device is done much like input data copy. In th
HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost));
+.. _compiling_on_the_command_line:
+
Compiling on the command line
=============================
-.. _setting_up_the_command-line:
+.. _setting_up_the_command_line:
Setting up the command line
---------------------------
diff --git a/docs/understand/compilers.rst b/docs/understand/compilers.rst
new file mode 100644
index 0000000000..0cb400c63d
--- /dev/null
+++ b/docs/understand/compilers.rst
@@ -0,0 +1,95 @@
+.. meta::
+ :description: Compilation workflow of the HIP compilers.
+ :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API
+
+.. _hip_compilers:
+
+********************************************************************************
+HIP compilers
+********************************************************************************
+
+ROCm provides the compiler driver ``hipcc``, that can be used on AMD ROCm and
+NVIDIA CUDA platforms.
+
+On ROCm, ``hipcc`` takes care of the following:
+
+- Setting the default library and include paths for HIP
+- Setting some environment variables
+- Invoking the appropriate compiler - ``amdclang++``
+
+On the NVIDIA CUDA platform, ``hipcc`` takes care of invoking the ``nvcc`` compiler.
+``amdclang++`` is based on the ``clang++`` compiler. For more
+details, see the :doc:`llvm project`.
+
+HIP compilation workflow
+================================================================================
+
+HIP provides a flexible compilation workflow that supports both offline compilation and runtime or just-in-time (JIT) compilation. Each approach has advantages depending on the use case, target architecture, and performance needs.
+
+The offline compilation is ideal for production environments, where the performance
+is critical and the target GPU architecture is known in advance.
+
+Runtime compilation is useful in development environments or when distributing
+software that must run on a wide range of hardware without knowing the target GPU in advance. It provides flexibility at the cost of some performance overhead.
+
+Offline compilation
+--------------------------------------------------------------------------------
+
+HIP code compilation is performed in two stages: the device-code compilation
+stage and the host-code compilation stage.
+
+- Device-code compilation stage: The compiled device code is embedded into the
+ host object file. Depending on the platform, the device code can be compiled
+ into assembly or binary. ``nvcc`` and ``amdclang++`` target different
+ architectures and use different code object formats. ``nvcc`` uses the binary
+ ``cubin`` or the assembly PTX files, while the ``amdclang++`` path is the
+ binary ``hsaco`` format. On CUDA platforms, the driver compiles the PTX files
+ to executable code during runtime.
+
+- Host-code compilation stage: On the host side, ``hipcc`` or ``amdclang++`` can
+ compile the host code in one step without other C++ compilers. On the other
+ hand, ``nvcc`` only replaces the ``<<<...>>>`` kernel launch syntax with the
+ appropriate CUDA runtime function call and the modified host code is passed to
+ the default host compiler.
+
+For an example of how to compile HIP from the command line, see the :ref:`SAXPY
+tutorial <compiling_on_the_command_line>`.
+
+Runtime compilation
+--------------------------------------------------------------------------------
+
+HIP allows you to compile kernels at runtime using the ``hiprtc*`` API. Kernels
+are stored as a text string, which is passed to HIPRTC alongside options to
+guide the compilation.
+
+For more details, see
+:doc:`HIP runtime compiler <../how-to/hip_rtc>`.
+
+Static libraries
+================================================================================
+
+``hipcc`` supports generating two types of static libraries.
+
+- The first type of static library only exports and launches host functions
+ within the same library and not the device functions. This library type offers
+ the ability to link with a non-hipcc compiler such as ``gcc``. Additionally,
+ this library type contains host objects with device code embedded as fat
+ binaries. This library type is generated using the flag ``--emit-static-lib``:
+
+ .. code-block:: shell
+
+ hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
+ gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
+
+- The second type of static library exports device functions to be linked by
+ other code objects by using ``hipcc`` as the linker. This library type
+ contains relocatable device objects and is generated using ``ar``:
+
+ .. code-block:: shell
+
+ hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
+ ar rcsD libHipDevice.a hipDevice.o
+ hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
+
+For more information, see `HIP samples host functions `_
+and `device functions `_.
diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst
index a4aa41fff7..b48778888d 100644
--- a/docs/understand/programming_model.rst
+++ b/docs/understand/programming_model.rst
@@ -2,7 +2,9 @@
:description: This chapter explains the HIP programming model, the contract
between the programmer and the compiler/runtime executing the
code, how it maps to the hardware.
- :keywords: AMD, ROCm, HIP, CUDA, API design
+ :keywords: ROCm, HIP, CUDA, API design, programming model
+
+.. _programming_model:
*******************************************************************************
HIP programming model
@@ -225,7 +227,7 @@ better than the defaults defined by the hardware.
The implicit groups defined by kernel launch parameters are still available
when working with cooperative groups.
-For further information, see :doc:`Cooperative groups `.
+For further information, see :doc:`Cooperative groups `.
Memory model
============
diff --git a/docs/what_is_hip.rst b/docs/what_is_hip.rst
new file mode 100644
index 0000000000..29cc8b5006
--- /dev/null
+++ b/docs/what_is_hip.rst
@@ -0,0 +1,97 @@
+.. meta::
+ :description: This chapter provides an introduction to the HIP API.
+ :keywords: AMD, ROCm, HIP, CUDA, C++ language extensions
+
+.. _intro-to-hip:
+
+*******************************************************************************
+What is HIP?
+*******************************************************************************
+
+The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime API
+and kernel language that lets developers create portable applications running in heterogeneous systems,
+using CPUs and AMD GPUs or NVIDIA GPUs from a single source code. HIP provides a simple
+marshalling language to access either the AMD ROCm back-end, or NVIDIA CUDA back-end,
+to build and run application kernels.
+
+.. figure:: data/what_is_hip/hip.svg
+ :alt: HIP in an application.
+ :align: center
+
+* HIP is a thin API with little or no performance impact over coding directly
+ in NVIDIA CUDA or AMD :doc:`ROCm `.
+* HIP enables coding in a single-source C++ programming language including
+ features such as templates, C++11 lambdas, classes, namespaces, and more.
+* Developers can specialize for the platform (CUDA or ROCm) to tune for
+ performance or handle tricky cases.
+
+ROCm offers compilers (``clang``, ``hipcc``), code
+profilers (``rocprof``, ``omnitrace``), debugging tools (``rocgdb``), libraries
+and HIP with the runtime API and kernel language, to create heterogeneous applications
+running on both CPUs and GPUs. ROCm provides marshalling libraries like
+:doc:`hipFFT ` or :doc:`hipBLAS ` that act as a
+thin programming layer over either NVIDIA CUDA or AMD ROCm to enable support for
+either back-end. These libraries offer pointer-based memory interfaces and are
+easily integrated into your applications.
+
+HIP supports the ability to build and run on either AMD GPUs or NVIDIA GPUs.
+GPU programmers familiar with NVIDIA CUDA or OpenCL will find the HIP API
+familiar and easy to use. Developers no longer need to choose between AMD or
+NVIDIA GPUs. You can quickly port your application to run on the available
+hardware while maintaining a single codebase. The :doc:`HIPify `
+tools, based on the clang front-end and Perl language, can convert CUDA API
+calls into the corresponding HIP API calls. However, HIP is not intended to be a
+drop-in replacement for CUDA, and developers should expect to do some manual
+coding and performance tuning work for AMD GPUs to port existing projects as
+described in the :doc:`HIP porting guide `.
+
+HIP provides two components: those that run on the CPU, also known as host
+system, and those that run on GPUs, also referred to as device. The host-based
+code is used to create device buffers, move data between the host application
+and a device, launch the device code (also known as kernel), manage streams and
+events, and perform synchronization. The kernel language provides a way to
+develop massively parallel programs that run on GPUs, and provides access to GPU
+specific hardware capabilities.
+
+In summary, HIP simplifies cross-platform development, maintains performance,
+and provides a familiar C++ experience for GPU programming that runs seamlessly
+on both AMD and NVIDIA GPUs.
+
+HIP components
+===============================================
+
+HIP consists of the following components. For information on the license
+associated with each component, see :doc:`HIP licensing `.
+
+C++ runtime API
+-----------------------------------------------
+
+For the AMD ROCm platform, HIP provides headers and a runtime library built on
+top of the HIP-Clang compiler in the repository
+:doc:`Compute Language Runtime (CLR) `. The HIP runtime
+implements HIP streams, events, and memory APIs, and is an object library that
+is linked with the application. The source code for all headers and the library
+implementation is available on GitHub.
+
+For the NVIDIA CUDA platform, HIP provides headers that translate from the
+HIP runtime API to the CUDA runtime API. The host-side contains mostly inlined
+wrappers or even just preprocessor defines, with no additional overhead.
+The device-side code is compiled with ``nvcc``, just like normal CUDA kernels,
+and therefore one can expect the same performance as if directly coding in CUDA.
+The CUDA specific headers can be found in the `hipother repository `_.
+
+For further details, check :ref:`HIP Runtime API Reference `.
+
+Kernel language
+-----------------------------------------------
+
+HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in
+compute kernels (classes, namespaces, operator overloading, and templates). HIP also defines other
+language features that are designed to target accelerators, such as:
+
+* Short-vector headers that can serve on a host or device
+* Math functions that resemble those in ``math.h``, which is included with standard C++ compilers
+* Built-in functions for accessing specific GPU hardware capabilities
+
+For further details, check :doc:`C++ language extensions `
+and :doc:`C++ language support `.
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index 9c2179bd75..ca0bb87d7a 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -717,7 +717,7 @@ enum hipLimit_t {
/** Allocates the memory as write-combined. On some system configurations, write-combined allocation
* may be transferred faster across the PCI Express bus, however, could have low read efficiency by
- * most CPUs. It's a good option for data tranfer from host to device via mapped pinned memory.*/
+ * most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.*/
#define hipHostMallocWriteCombined 0x4
#define hipHostAllocWriteCombined 0x4
@@ -728,11 +728,11 @@ enum hipLimit_t {
#define hipHostMallocNumaUser 0x20000000
#define hipExtHostAllocNumaUser 0x20000000
-/** Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
+/** Allocate coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
#define hipHostMallocCoherent 0x40000000
#define hipExtHostAllocCoherent 0x40000000
-/** Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
+/** Allocate non-coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
#define hipHostMallocNonCoherent 0x80000000
#define hipExtHostAllocNonCoherent 0x80000000
@@ -3448,7 +3448,6 @@ hipError_t hipMemAllocHost(void** ptr, size_t size);
/**
* @}
*/
-
/**
* @brief Allocates device accessible page locked (pinned) host memory
*
@@ -3536,6 +3535,8 @@ hipError_t hipExtHostAlloc(void** ptr, size_t size, unsigned int flags);
*
* The API returns the allocation pointer, managed by HMM, can be used further to execute kernels
* on device and fetch data between the host and device as needed.
+ *
+ * If HMM is not supported, the function behaves the same as @p hipMallocHost .
*
* @note It is recommend to do the capability check before call this API.
*