From de3632d3a35c0a3bc942c403f073c30fa897386c Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 13:43:58 +0100 Subject: [PATCH 01/76] Insert build system changes. --- src/Makefile | 6 +++ src/chain/Makefile | 13 +++++- src/configure | 79 ++++++++++++++++++++++++++++++++-- src/cudamatrix/Makefile | 13 +++++- src/makefiles/default_rules.mk | 10 ++++- src/nnet3/Makefile | 7 ++- src/nnet3bin/Makefile | 6 +++ 7 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..bc4375e30f6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..c4411f4b997 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,18 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ed627eceedc..feb2fd276ad 100755 --- a/src/configure +++ b/src/configure @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,63 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ]; then + ROCMDIR=$base + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true">> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCM will not be used! + ROCM is only supported with 64-bit Linux builds." + exit 1; + fi + + #add cusolver flags for newer toolkits + if [ "$CUSOLVER" == "true" ]; then + echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -371,6 +431,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +663,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +713,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +794,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -1304,6 +1376,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." 
failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..31c7c5ef3e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,19 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endifn include ../makefiles/default_rules.mk diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 3ae5ed5e2dd..c27b7b0a108 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,12 +145,17 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_ENABLED), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) +ifeq ($(CUDA), true) NVCC_DEP_COMMAND = $(CUDATKDIR)/bin/nvcc -M $(CUDA_FLAGS) $(CUDA_INCLUDE) $(CUDA_SRCS) endif +ifeq ($(ROCM), true) +HIPCC_DEP_COMMAND = $(HIPCC) -M $(ROCM_FLAGS) $(ROCM_INCLUDE) $(CUDA_SRCS) +endif +endif endif .PHONY: depend @@ -162,6 +167,9 @@ endif ifneq ($(NVCC_DEP_COMMAND),) -$(NVCC_DEP_COMMAND) >> .depend.mk endif +ifneq ($(HIPCC_DEP_COMMAND),) + -$(HIPCC_DEP_COMMAND) >> .depend.mk +endif # removing automatic making of "depend" as it's quite slow. #.depend.mk: depend diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 0bf1bebe096..b6c75ac7118 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -3,9 +3,14 @@ all: include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) - +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-descriptor-test nnet-parse-test nnet-component-test \ diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2bd23273982 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -3,8 +3,14 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \ From 64c27545ce49357fe900de377eb266e9fe11f46d Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 10:03:38 -0500 Subject: [PATCH 02/76] Remove extra quote. 
--- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index feb2fd276ad..21e439eeb4b 100755 --- a/src/configure +++ b/src/configure @@ -275,7 +275,7 @@ function configure_rocm { echo "# ROCm configuration" >> kaldi.mk echo >> kaldi.mk echo IS_GPU_BUILD = true >> kaldi.mk - echo ROCM = true">> kaldi.mk + echo ROCM = true >> kaldi.mk echo "ROCMDIR = $ROCMDIR" >> kaldi.mk echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk From ee18146a6ce723de6c26a78890f6e83b484c0460 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 07:05:47 -0500 Subject: [PATCH 03/76] Add hipify header. --- src/configure | 3 +- src/cudamatrix/Makefile | 4 +- src/cudamatrix/cu-device.cc | 8 +- src/cudamatrix/cu-kernels.cu | 9 ++- src/hip/hipify.h | 22 +++++ src/hip/math_constants.h | 152 +++++++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 21 +++++ 7 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 src/hip/hipify.h create mode 100644 src/hip/math_constants.h create mode 100644 src/makefiles/hip_64bit.mk diff --git a/src/configure b/src/configure index 21e439eeb4b..fa0b77373a0 100755 --- a/src/configure +++ b/src/configure @@ -258,9 +258,10 @@ function check_for_slow_expf { function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then - for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do if [ -f $base/bin/hipcc ]; then ROCMDIR=$base + break fi done fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 31c7c5ef3e5..512028c6c13 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -39,7 +39,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endifn + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..5bcb0552924 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..c644cbc0784 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,10 +28,17 @@ #include #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "cudamatrix/cu-kernels-ansi.h" +#include +#include +#else #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions diff --git a/src/hip/hipify.h b/src/hip/hipify.h new file mode 100644 index 00000000000..41b7a02cb04 --- /dev/null +++ b/src/hip/hipify.h @@ -0,0 +1,22 @@ +#ifndef __HIPIFY_H__ +#define __HIPIFY_H__ + +inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +// +// HIP types +// +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDevice hipGetDevice +#define cudaStream_t hipStream_t +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) + +// 
+// HIPCUB +// +#define cub hipcub + + +#endif //__HIPIFY_H__ diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h new file mode 100644 index 00000000000..7fb8fce8e71 --- /dev/null +++ b/src/hip/math_constants.h @@ -0,0 +1,152 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__MATH_CONSTANTS_H__) +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ +#define CUDART_INF_F __int_as_float(0x7f800000) +#define CUDART_NAN_F __int_as_float(0x7fffffff) +#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) +#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) +#define CUDART_ZERO_F 0.0f +#define CUDART_ONE_F 1.0f +#define CUDART_SQRT_HALF_F 0.707106781f +#define CUDART_SQRT_HALF_HI_F 0.707106781f +#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f +#define CUDART_SQRT_TWO_F 1.414213562f +#define CUDART_THIRD_F 0.333333333f +#define CUDART_PIO4_F 0.785398163f +#define CUDART_PIO2_F 1.570796327f +#define CUDART_3PIO4_F 2.356194490f +#define CUDART_2_OVER_PI_F 0.636619772f +#define CUDART_SQRT_2_OVER_PI_F 0.797884561f +#define CUDART_PI_F 3.141592654f +#define CUDART_L2E_F 1.442695041f +#define CUDART_L2T_F 3.321928094f +#define CUDART_LG2_F 0.301029996f +#define CUDART_LGE_F 0.434294482f +#define CUDART_LN2_F 0.693147181f +#define CUDART_LNT_F 2.302585093f +#define CUDART_LNPI_F 1.144729886f +#define CUDART_TWO_TO_M126_F 1.175494351e-38f +#define CUDART_TWO_TO_126_F 8.507059173e37f +#define CUDART_NORM_HUGE_F 3.402823466e38f +#define CUDART_TWO_TO_23_F 8388608.0f +#define CUDART_TWO_TO_24_F 16777216.0f +#define CUDART_TWO_TO_31_F 2147483648.0f +#define CUDART_TWO_TO_32_F 4294967296.0f +#define CUDART_REMQUO_BITS_F 3 +#define CUDART_REMQUO_MASK_F (~((~0)< Date: Thu, 8 Sep 2022 18:07:47 -0500 Subject: [PATCH 04/76] Add more entries to hipificatiion header to deal with the BLAS routines. --- src/cudamatrix/cu-allocator.h | 7 ++ src/cudamatrix/cu-array-inl.h | 5 ++ src/cudamatrix/cu-common.h | 9 +++ src/cudamatrix/cu-device.h | 14 +++- src/cudamatrix/cu-matrix.cc | 6 ++ src/cudamatrix/cublas-wrappers.h | 17 ++-- src/hip/hipify.h | 129 +++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 2 +- 8 files changed, 181 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..a3baa2fb33d 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,17 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..36b829046ed 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,12 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..617f4363269 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,20 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +//TODO: tests with ROCTX #include +#include +#else #include #include #include #include #include +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..515fa4d7d25 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,26 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include +#include +#else #include #include #include #include #include - 
+#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..a522f13451a 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..dc5c0e0ced5 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,17 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, + C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,8 +66,8 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, + return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, + (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); @@ -219,6 +222,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +247,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +324,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +341,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 
&& !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 41b7a02cb04..697afc7a6d3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,14 +5,143 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types +// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // + #define cudaDevAttrWarpSize hipDeviceAttributeWarpSize #define cudaDeviceGetAttribute hipDeviceGetAttribute #define cudaGetDevice hipGetDevice +#define cudaGetErrorString hipGetErrorString #define cudaStream_t hipStream_t #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) +#define cublasStatus_t hipblasStatus_t +#define cudaError_t hipError_t +#define cusparseDestroy hipsparseDestroy +#define cudaGetLastError hipGetLastError +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cublasCreate hipblasCreate +#define cublasSetStream hipblasSetStream +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define curandCreateGenerator hiprandCreateGenerator +#define curandSetStream hiprandSetStream +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetDeviceProperties hipGetDeviceProperties +#define curandDestroyGenerator hiprandDestroyGenerator +#define cusparseDestroy hipsparseDestroy +#define cudaDeviceProp hipDeviceProp_t +#define cublasOperation_t hipblasOperation_t +#define cublasStatus_t hipblasStatus_t +#define cusparseStatus_t hipsparseStatus_t +#define curandStatus_t hiprandStatus_t +#define cublasHandle_t hipblasHandle_t +#define cusparseHandle_t hipsparseHandle_t +#define curandGenerator_t hiprandGenerator_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cublasComputeType_t hipblasDatatype_t +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasDscal_v2 hipblasDscal +#define cublasSscal_v2 hipblasSscal +#define cudaSetDevice hipSetDevice +#define cudaSuccess hipSuccess +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnSetStream hipsolverDnSetStream +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define cusparseCreate hipsparseCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusparseSetStream hipsparseSetStream +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
+#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceReset hipDeviceReset +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cublasDestroy hipblasDestroy +#define cuDeviceGetName hipDeviceGetName +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cudaMemsetAsync hipMemsetAsync +#define cublasGemmEx hipblasGemmEx +#define cublasDgemm_v2 hipblasDgemm +#define cublasSger_v2 hipblasSger +#define cublasDger_v2 hipblasDger +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasSdot_v2 hipblasSdot +#define cublasSasum_v2 hipblasSasum +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasScopy_v2 hipblasScopy +#define cublasDcopy_v2 hipblasDcopy +#define cublasSgemv_v2 hipblasSgemv +#define cublasDgemv_v2 hipblasDgemv +#define cublasSspmv_v2 hipblasSspmv +#define cublasDspmv_v2 hipblasDspmv +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasSspr_v2 hipblasSspr +#define cublasDspr_v2 hipblasDspr +#define cudaDataType hipDataType +#define cusparseAction_t hipsparseAction_t +#define cublasDdot_v2 hipblasDdot +#define cublasDasum_v2 hipblasDasum +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasStpmv_v2 hipblasStpmv +#define cusparseIndexBase_t hipsparseIndexBase_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseOperation_t hipsparseOperation_t +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define cusparseCreateDnMat hipsparseCreateDnMat +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMM hipsparseSpMM +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseScsr2csc hipsparseScsr2csc +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_R_32F HIPBLAS_R_32F +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_OP_N HIPBLAS_OP_N +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define 
cudaMemset2DAsync hipMemset2DAsync // // HIPCUB // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 453d9d5fe62..b405d84a15b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,7 +5,7 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 \ +CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) From 07f2f36e398aa09a59a6655c212f8c1233f81216 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 18:36:28 -0500 Subject: [PATCH 05/76] Cudmatrix hipification complete. --- src/cudamatrix/cu-allocator.cc | 7 +++++ src/cudamatrix/cu-array.cc | 5 +++ src/cudamatrix/cu-block-matrix.cc | 6 ++++ src/cudamatrix/cu-common.cc | 5 +++ src/cudamatrix/cu-compressed-matrix.cc | 6 ++++ src/cudamatrix/cu-packed-matrix.cc | 6 ++++ src/cudamatrix/cu-sp-matrix.cc | 6 ++++ src/cudamatrix/cu-sparse-matrix.cc | 6 ++++ src/cudamatrix/cu-tp-matrix.cc | 6 ++++ src/cudamatrix/cu-vector.cc | 6 ++++ src/hip/hipify.h | 42 ++++++++++++++++++++++++++ 11 files changed, 101 insertions(+) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..8e08d3ef2a1 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #include +#endif + #include #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2017ebce5c7 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..a2bd910eba0 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..585d980ed19 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,12 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..0a5537b4248 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..f0563a6123f 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include 
+#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..a328457ca11 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..c0ebddfc95e 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..6929911fb5e 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..fa5d94fb0bc 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 697afc7a6d3..10010ceb70f 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -139,9 +139,51 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseDestroyMatDescr hipsparseDestroyMatDescr #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_C HIPBLAS_OP_C #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemset2DAsync hipMemset2DAsync +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define CUSPARSE_STATUS_NOT_SUPPORTED 
HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cuMemGetInfo_v2 hipMemGetInfo + // // HIPCUB // From fde6f7f478ce18af0142885fd625a33ce2946671 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 06:54:00 -0500 Subject: [PATCH 06/76] Ignore Eclipse synchronized project files. --- .gitignore | 4 ++++ src/chain/Makefile | 2 +- src/chain/chain-kernels-ansi.h | 4 ++++ src/chain/chain-kernels.cu | 5 +++++ src/makefiles/hip_64bit.mk | 8 +++----- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..53a4079d9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,7 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* + +# Eclipse sync project +.ptp-sync +.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index c4411f4b997..678bb03ef33 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -38,7 +38,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..739b9005854 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). 
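The pattern above keeps the kernels in their .cu files: hipcc is invoked
with -x hip (see the chain Makefile), so the file is compiled as HIP C++,
__IS_HIP_COMPILE__ selects the hipify.h mapping header, and hip_64bit.mk
defines __CUDA_ARCH__=800 so that legacy guards like the #error above still
pass. A minimal sketch of a translation unit following this pattern
(hypothetical file, not part of the patch):

    #ifdef __IS_HIP_COMPILE__
    #include <hip/hip_runtime.h>  // HIP runtime, found via the ROCm include paths
    #include "hipify.h"           // maps cuda*/cublas* spellings onto hip*
    #endif

    // The kernel body is identical for both toolchains: blockIdx, blockDim
    // and threadIdx are native to CUDA and HIP alike.
    __global__ void scale_kernel(float *data, float alpha, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= alpha;
    }
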
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index b405d84a15b..6ca4ea7d1b6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -13,9 +13,7 @@ ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64/stubs -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib/stubs -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -ROCM_LDFLAGS += - +#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib #CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt -ROCM_LDLIBS += +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 21ca60dfeeee2496801869ee96667cfd73df4aa6 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 08:02:20 -0500 Subject: [PATCH 07/76] Hipify complete including NVTX. --- src/chain/chain-kernels.cu | 1 - src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 84 ++++++++++++++------------ src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 7 ++- 16 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 739b9005854..2a30128750c 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -22,7 +22,6 @@ #ifdef __IS_HIP_COMPILE__ #include -#include #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 8e08d3ef2a1..82d682588d8 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index a3baa2fb33d..0cc1f7e6a4b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index a2bd910eba0..04885296445 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 585d980ed19..6275bc9073a 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -25,8 +25,10 @@ #ifdef __IS_HIP_COMPILE__ #include #include +#define API_NAME_PREFIX "HIP" #else #include +#define API_NAME_PREFIX "CU" #endif #include "base/kaldi-common.h" @@ -36,6 +38,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 
0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -48,9 +53,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else nvtxRangePop(); +#endif } #endif @@ -92,16 +102,16 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; } return "CUBLAS_STATUS_UNKNOWN_ERROR"; } @@ -110,43 +120,43 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case 
CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; + case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: return 
API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 617f4363269..a0c879414d4 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -36,7 +36,7 @@ #include #include #include -//TODO: tests with ROCTX #include +#include #include #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 0a5537b4248..de4fe6f8da2 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5bcb0552924..41f8d6f83d5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 515fa4d7d25..9286b6fe14a 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index a522f13451a..675ed74aeb4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index f0563a6123f..5acfc7443c4 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a328457ca11..adfb3e0b517 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index c0ebddfc95e..45742571a41 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 6929911fb5e..51fb744a855 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index fa5d94fb0bc..62ff16cb7f9 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 6ca4ea7d1b6..0ff628d67f6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,11 +5,14 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) 
endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ + +ROCM_USEROCTX = -DUSE_NVTX + +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) -ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ +ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread From 104023482690fbdc92d1cb190a85de8b697f86be Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:21:01 -0500 Subject: [PATCH 08/76] Format files for the hipification. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-array-inl.h | 2 +- src/cudamatrix/cu-array.cc | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 13 +- src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-kernels.cu | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 347 ++++++++++++------------- src/makefiles/hip_64bit.mk | 5 +- 19 files changed, 198 insertions(+), 199 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 82d682588d8..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,7 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0cc1f7e6a4b..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 36b829046ed..1fd80502cf9 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,7 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 2017ebce5c7..333e8fbed1c 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 04885296445..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 6275bc9073a..2e77062f20d 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #define API_NAME_PREFIX "HIP" #else 
#include @@ -112,8 +112,12 @@ const char* cublasGetStatusStringK(cublasStatus_t status) { case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { @@ -135,7 +139,7 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "SPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { @@ -155,6 +159,9 @@ const char* curandGetStatusString(curandStatus_t status) { case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index a0c879414d4..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index de4fe6f8da2..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 41f8d6f83d5..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 9286b6fe14a..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index c644cbc0784..9a99f19b58f 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,7 +30,7 @@ #include #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include #else #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 675ed74aeb4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,7 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 5acfc7443c4..c9d686d0ce8 100644 ---
a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index adfb3e0b517..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 45742571a41..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 51fb744a855..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 62ff16cb7f9..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 10010ceb70f..89daad6bc28 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,187 +5,180 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types -// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // - -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaGetDevice hipGetDevice -#define cudaGetErrorString hipGetErrorString -#define cudaStream_t hipStream_t -#define cudaStreamLegacy ((hipStream_t)1) -#define cudaStreamPerThread ((hipStream_t)2) -#define cublasStatus_t hipblasStatus_t -#define cudaError_t hipError_t -#define cusparseDestroy hipsparseDestroy -#define cudaGetLastError hipGetLastError - -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cublasCreate hipblasCreate -#define cublasSetStream hipblasSetStream -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define curandCreateGenerator hiprandCreateGenerator -#define curandSetStream hiprandSetStream -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaGetDeviceProperties hipGetDeviceProperties -#define curandDestroyGenerator hiprandDestroyGenerator -#define cusparseDestroy hipsparseDestroy -#define cudaDeviceProp hipDeviceProp_t -#define cublasOperation_t hipblasOperation_t -#define cublasStatus_t hipblasStatus_t -#define cusparseStatus_t hipsparseStatus_t -#define curandStatus_t hiprandStatus_t -#define cublasHandle_t hipblasHandle_t -#define cusparseHandle_t hipsparseHandle_t -#define curandGenerator_t hiprandGenerator_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cublasComputeType_t hipblasDatatype_t -#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStreamSynchronize hipStreamSynchronize -#define 
cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasDscal_v2 hipblasDscal -#define cublasSscal_v2 hipblasSscal -#define cudaSetDevice hipSetDevice -#define cudaSuccess hipSuccess -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnSetStream hipsolverDnSetStream -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define cusparseCreate hipsparseCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusparseSetStream hipsparseSetStream -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaDeviceReset hipDeviceReset -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cublasDestroy hipblasDestroy -#define cuDeviceGetName hipDeviceGetName -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cudaMemsetAsync hipMemsetAsync -#define cublasGemmEx hipblasGemmEx -#define cublasDgemm_v2 hipblasDgemm -#define cublasSger_v2 hipblasSger -#define cublasDger_v2 hipblasDger -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasSdot_v2 hipblasSdot -#define cublasSasum_v2 hipblasSasum -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasScopy_v2 hipblasScopy -#define cublasDcopy_v2 hipblasDcopy -#define cublasSgemv_v2 hipblasSgemv -#define cublasDgemv_v2 hipblasDgemv -#define cublasSspmv_v2 hipblasSspmv -#define cublasDspmv_v2 hipblasDspmv -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasSspr_v2 hipblasSspr -#define cublasDspr_v2 hipblasDspr -#define cudaDataType hipDataType -#define cusparseAction_t hipsparseAction_t -#define cublasDdot_v2 hipblasDdot -#define cublasDasum_v2 hipblasDasum -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasStpmv_v2 hipblasStpmv -#define cusparseIndexBase_t hipsparseIndexBase_t -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define cusparseOperation_t hipsparseOperation_t -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define 
CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define cusparseCreateDnMat hipsparseCreateDnMat -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMM hipsparseSpMM -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseScsr2csc hipsparseScsr2csc -#define CUDA_R_64F HIP_R_64F -#define CUDA_R_32F HIP_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_R_32F HIPBLAS_R_32F -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_C HIPBLAS_OP_C -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemset2DAsync hipMemset2DAsync -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define 
CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR #define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO -#define cudaMalloc hipMalloc -#define cudaMallocPitch hipMallocPitch -#define cuMemGetInfo_v2 hipMemGetInfo +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName 
+#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaSetDevice hipSetDevice +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble 
+#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetStream hiprandSetStream +#define curandStatus_t hiprandStatus_t +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cusolverDnSetStream hipsolverDnSetStream +#define cusparseAction_t hipsparseAction_t +#define cusparseCreate hipsparseCreate +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateDnMat hipsparseCreateDnMat +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define cusparseHandle_t hipsparseHandle_t +#define cusparseIndexBase_t hipsparseIndexBase_t +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cusparseOperation_t hipsparseOperation_t +#define cusparseScsr2csc hipsparseScsr2csc +#define cusparseSetStream hipsparseSetStream +#define cusparseSpMM hipsparseSpMM +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseStatus_t hipsparseStatus_t // -// HIPCUB +// HIPCUB namespace. // #define cub hipcub diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0ff628d67f6..0c558a770d6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,8 +5,8 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif - -ROCM_USEROCTX = -DUSE_NVTX +# Uncomment if willing to use ROCTX capabilities. +# ROCM_USEROCTX = -DUSE_NVTX CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) @@ -18,5 +18,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -#CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 801115d710904ca505e318e9cd9cc3ffa7fc0f87 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:57:45 -0500 Subject: [PATCH 09/76] Add hipification entries dropped by mistake. --- src/hip/hipify.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 89daad6bc28..7a0300ae02b 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -7,9 +7,12 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute is a viable replacement for the tensor-core alternative. +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute is a viable replacement for the tensor-core alternative.
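+// For example (illustrative expansion only): a call written as
+//   cublasGemmEx(..., CUBLAS_COMPUTE_32F_FAST_TF32, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
+// compiles under these defines as
+//   hipblasGemmEx(..., HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT),
+// i.e. the TF32/FP16 fast paths fall back to plain FP32 GEMMs on ROCm.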
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. #define CUBLAS_OP_C HIPBLAS_OP_C #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_N HIPBLAS_OP_N @@ -146,6 +149,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandGenerateUniformDouble hiprandGenerateUniformDouble #define curandGenerator_t hiprandGenerator_t #define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordering. #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t From 081de1ebcc44b846c4953bb3923818d6142b90cc Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 06:06:19 -0500 Subject: [PATCH 10/76] Change IS_GPU_ENABLED to IS_GPU_BUILD in depends build. --- src/makefiles/default_rules.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index c27b7b0a108..21a3b053639 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,7 +145,7 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(IS_GPU_ENABLED), true) +ifeq ($(IS_GPU_BUILD), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) From 00098bf097ca7e9e804562c937b20c6714adf2f8 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:11:35 -0500 Subject: [PATCH 11/76] Add build logic for ROCm < 5.2.0. --- src/configure | 28 +++++++++++++++++++++------- src/hip/hipify.h | 21 +++++++++++++++++++++ src/makefiles/hip_64bit.mk | 17 ++++++++++++----- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/configure b/src/configure index fa0b77373a0..ffb87abe106 100755 --- a/src/configure +++ b/src/configure @@ -259,7 +259,7 @@ function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do - if [ -f $base/bin/hipcc ]; then + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then ROCMDIR=$base break fi @@ -268,7 +268,7 @@ function configure_rocm { if [ -d "$ROCMDIR" ]; then if [ ! -f $ROCMDIR/bin/hipcc ]; then - failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + failure "Cannot find hipcc and hipconfig in ROCm directory $ROCMDIR" fi fi echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" @@ -289,7 +289,20 @@ function configure_rocm { echo "HOST_ARCH = `uname -m`" >> kaldi.mk echo >> kaldi.mk - + + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Enable the HIP implementation for CXX compile commands. ROCm 5.2.0 onwards uses + # __HIP_PLATFORM_AMD__; older versions use __HIP_PLATFORM_HCC__. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk + else + echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + fi + # 64bit/32bit? Not Linux?
We do not support cross compilation with ROCm so, # use direct calls to uname -m here if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then @@ -300,10 +313,11 @@ WARNING: ROCM will not be used! ROCM is only supported with 64-bit Linux builds." exit 1; fi - - #add cusolver flags for newer toolkits - if [ "$CUSOLVER" == "true" ]; then - echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + else + echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk fi } diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 7a0300ae02b..bdefa9cc4dd 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -3,6 +3,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +#undef hipLaunchKernelGGLInternal +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#else +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#endif + // // HIP types // @@ -153,10 +167,17 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t +#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#else +#define cusolverDnCreate hipsolverCreate +#define cusolverDnDestroy hipsolverDestroy +#define cusolverDnHandle_t hipsolverHandle_t +#define cusolverDnSetStream hipsolverSetStream +#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0c558a770d6..3976624032d 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -8,13 +8,20 @@ endif # Uncomment if willing to use ROCTX capabilities. # ROCM_USEROCTX = -DUSE_NVTX -CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) +# Specific HIP/ROCm components should be included prior to the generic include to avoid +# deprecation warnings. +CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -DCUDA_VERSION=11000 \ + -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. 
-I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ - -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib From 9b8dffb3a594293fbf4286233df610ae6041b284 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:33:16 -0500 Subject: [PATCH 12/76] Complete ROCm 5.0.2 build with no per-thread streams yet. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +++--- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..3b47ee525eb 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..09ba2c9aa13 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..309d68fccf7 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..99165cc592f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..dfcbf41d131 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..c073ab358ea 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h 
b/src/cudamatrix/cu-device.h index d7edf5a5a1c..1311668ec33 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..96c1ef14ed4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..8a5865f71af 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..fabd06c9b16 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..3853ffa7e45 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..dd3a333c9a5 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..cc6332ba48c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 3976624032d..160f5fb5c0f 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -10,7 +10,7 @@ endif # Specific HIP/ROCm components should be included prior to the generic include to avoid # deprecation warnings. -CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ From e84d8f072496c9427e804f8189854da9ff49c04b Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 07:44:43 -0500 Subject: [PATCH 13/76] Add cudadecoder support for ROCm 5.2.x. 
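(Context for the kernel-launch rules added in this patch: pre-5.2 ROCm lacks hipcc's -fgpu-default-stream=per-thread option, so when configure sets ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION the GPU Makefiles derive a %.hip intermediate whose kernel launches name the stream explicitly. A sketch of the rewrite, where my_kernel, grid, block and args are placeholder names:

    my_kernel<<<grid, block>>>(args);
    // becomes
    my_kernel<<<grid, block, 0, hipStreamPerThread>>>(args);

Launches that already pass an explicit stream as a fourth launch argument should be left untouched by the two sed substitutions.)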
--- src/chain/Makefile | 16 ++++-- src/configure | 9 +++- src/cudadecoder/Makefile | 22 +++++++- .../batched-static-nnet3-kernels.cu | 5 ++ .../batched-static-nnet3-kernels.h | 5 ++ ...hed-threaded-nnet3-cuda-online-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline2.cc | 5 ++ src/cudadecoder/cuda-decoder-kernels-utils.h | 4 +- src/cudadecoder/cuda-decoder-kernels.cu | 6 +++ src/cudadecoder/cuda-decoder.cc | 24 +++++---- src/cudadecoder/cuda-decoder.h | 5 ++ src/cudadecoder/cuda-fst.cc | 6 +++ src/cudadecoderbin/Makefile | 4 +- .../batched-wav-nnet3-cuda-online.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 7 +++ src/cudafeat/Makefile | 23 +++++++- ...eature-online-batched-cmvn-cuda-kernels.cu | 5 ++ ...ure-online-batched-ivector-cuda-kernels.cu | 5 ++ .../feature-online-batched-ivector-cuda.cc | 16 ++++++ ...re-online-batched-spectral-cuda-kernels.cu | 6 +++ .../feature-online-batched-spectral-cuda.h | 5 ++ src/cudafeat/feature-online-cmvn-cuda.cu | 8 +++ src/cudafeat/feature-spectral-cuda.cu | 6 +++ src/cudafeat/feature-spectral-cuda.h | 5 ++ src/cudafeat/feature-window-cuda.cu | 5 ++ .../online-batched-feature-pipeline-cuda.cc | 7 ++- .../online-batched-feature-pipeline-cuda.h | 4 ++ .../online-ivector-feature-cuda-kernels.cu | 6 +++ src/cudafeat/online-ivector-feature-cuda.cc | 14 ++++- src/cudamatrix/Makefile | 16 ++++-- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +-- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 +-- src/cudamatrix/cu-kernels.cu | 1 + src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 54 ++++++++++++++----- src/makefiles/hip_64bit.mk | 18 +++++-- 48 files changed, 318 insertions(+), 62 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 678bb03ef33..5cc8d8901a1 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,13 +33,21 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ffb87abe106..ca3df9563ab 100755 --- a/src/configure +++ b/src/configure @@ -316,8 +316,9 @@ WARNING: ROCM will not be used! 
if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk else - echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk fi } @@ -1055,7 +1056,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..062e9a47d41 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..0bcb1997576 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,12 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..c7012b686e0 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,12 @@ #include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc 
index 89e93e5d98c..d5cf7dae2d7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..f6a3455db01 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..6a14371911d 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
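+// "float.h" is included explicitly here, presumably for the FLT_MAX-style
+// limits the decoder kernels rely on, which the HIP headers do not appear to
+// provide transitively.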
+#ifdef __IS_HIP_COMPILE__ +#include "float.h" +#include +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..06dceae73a5 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,14 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +190,35 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void**)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void**)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void**)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void**)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..510904aa004 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,12 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..3af37eb7676 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,14 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- 
a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..56368853df2 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. #endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..05af50d7a3b 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..c14571f2ed9 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,16 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include "hipify.h" +#else #include #include #include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..c3a4489e18e 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..09b0caff255 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 
+15,12 @@ // See the License for the specific language governing permissions and // limitations under the License. // +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..0b4cfce812c 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,12 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..6d68c93f917 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..f847311d755 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..8d4648d04bb 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +39,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..c320c85a029 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-spectral-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..6ba45e682c1 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,12 @@ #include "cudafeat/feature-window-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..650b51ec3c7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,12 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { @@ -95,7 +100,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..378ea18e689 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..c3b15d72a5b 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,19 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else #include #endif +#endif + #include #include "base/io-funcs.h" @@ -288,13 +299,14 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. CuMatrix A(quadratic); + + #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); - // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 512028c6c13..5cd4adcffd8 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,12 +34,20 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 3b47ee525eb..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 09ba2c9aa13..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 309d68fccf7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 99165cc592f..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc 
b/src/cudamatrix/cu-compressed-matrix.cc index dfcbf41d131..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c073ab358ea..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 1311668ec33..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9a99f19b58f..1d6e0664541 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -23,6 +23,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. + // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 96c1ef14ed4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 8a5865f71af..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index fabd06c9b16..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 3853ffa7e45..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index dd3a333c9a5..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cc6332ba48c..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index bdefa9cc4dd..24b5f2f8eb3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -1,29 +1,22 @@ #ifndef __HIPIFY_H__ #define __HIPIFY_H__ 
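+// hipify.h is a thin compatibility shim: plain #define mappings translate
+// CUDA API names, types and enumerators to their HIP equivalents, so the
+// unmodified CUDA sources can be compiled with hipcc. For example, with
+// this header included, a call such as cudaMalloc(&ptr, bytes) compiles
+// as hipMalloc(&ptr, bytes) via the mappings below.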
+#ifdef __HIPCC__ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} - - -#undef hipLaunchKernelGGLInternal -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +// AMDGCN only support this rounding mode. +#define __fdiv_rd __fdiv_rn #else -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +#define __align__(x) __attribute__((aligned (x))) #endif // // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. @@ -46,6 +39,8 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUDA_R_32F HIP_R_32F #define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS #define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT #define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED #define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH @@ -104,6 +99,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cublasGemmAlgo_t hipblasGemmAlgo_t #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasOperation_t hipblasOperation_t #define cublasSasum_v2 hipblasSasum @@ -133,15 +129,29 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse #define cudaErrorInvalidDevice hipErrorInvalidDevice #define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName #define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define 
cudaHostUnregister hipHostUnregister #define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice @@ -150,11 +160,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync #define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream #define curandCreateGenerator hiprandCreateGenerator #define curandDestroyGenerator hiprandDestroyGenerator #define curandGenerateNormal hiprandGenerateNormal @@ -178,6 +197,11 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream #endif +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -201,7 +225,9 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize #define cusparseSpMatDescr_t hipsparseSpMatDescr_t #define cusparseStatus_t hipsparseStatus_t - +#define nvtxRangePop roctxRangePop +#define nvtxRangePush roctxRangePush +#define nvtxRangePushA roctxRangePushA // // HIPCUB namespace. // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 160f5fb5c0f..e2f43ecd55c 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -14,9 +14,21 @@ CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) + -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. 
-I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ @@ -25,4 +37,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 From aed0ce594e72bc935ab1f2fade0f26aa5229a3b9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 11:44:33 -0500 Subject: [PATCH 14/76] Complete support for ROCm 5.0.2. --- src/chain/Makefile | 2 +- src/cudadecoder/Makefile | 2 +- src/cudafeat/Makefile | 2 +- .../feature-online-batched-ivector-cuda.cc | 41 +++++++++++++++++-- .../feature-online-batched-spectral-cuda.h | 4 ++ src/cudafeat/feature-spectral-cuda.h | 4 ++ src/cudafeat/online-ivector-feature-cuda.cc | 17 ++++++++ src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-allocator.cc | 4 ++ src/cudamatrix/cu-allocator.h | 4 ++ src/cudamatrix/cu-block-matrix.cc | 4 ++ src/cudamatrix/cu-common.h | 7 +++- src/cudamatrix/cu-compressed-matrix.cc | 4 ++ src/cudamatrix/cu-device.cc | 5 ++- src/cudamatrix/cu-device.h | 11 ++++- src/cudamatrix/cu-matrix.cc | 4 ++ src/cudamatrix/cu-packed-matrix.cc | 4 ++ src/cudamatrix/cu-sp-matrix.cc | 4 ++ src/cudamatrix/cu-sparse-matrix.cc | 4 ++ src/cudamatrix/cu-tp-matrix.cc | 4 ++ src/cudamatrix/cu-vector.cc | 4 ++ src/hip/hipify.h | 16 +++++--- 22 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 5cc8d8901a1..5b177981ad8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -36,7 +36,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index 062e9a47d41..d4eda345564 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -44,7 +44,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c3a4489e18e..c0f54a854e8 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -47,7 +47,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 6d68c93f917..68c247b43e9 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -17,9 +17,6 @@ #ifdef __IS_HIP_COMPILE__ #include "hipify.h" -#define 
cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched // The BLAS enumerators are used instead of the SOLVER ones. #ifdef CUBLAS_FILL_MODE_LOWER #undef CUBLAS_FILL_MODE_LOWER @@ -385,6 +382,43 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; + +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work; + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, &L_work, num_lanes)); + // allocate temp buffer + float *workspace = static_cast( + CuDevice::Instantiate().Malloc(L_work * sizeof(float))); + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); + + int L_work2; + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + // solve for rhs in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, + num_lanes)); + + CuDevice::Instantiate().Free(workspace); + CuDevice::Instantiate().Free(workspace2); +#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -395,6 +429,7 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); +#endif #endif // cusolver solves in place. 
Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..202232c6b23 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..66f0dce395a 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index c3b15d72a5b..56dbac93165 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -317,9 +317,26 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work2; + CUSOLVER_SAFE_CALL( + hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + CUSOLVER_SAFE_CALL(hipsolverSpotrs( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); + + CuDevice::Instantiate().Free(workspace2); +#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); +#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 5cd4adcffd8..3c1100753e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -37,7 +37,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..d81dca002ce 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,11 @@ #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..f776bbb620e 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..7983cd250e7 100644 --- 
a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..c4bdf569d3c 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,15 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include -#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..442d2dbac67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..3dada172ba8 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,10 +21,13 @@ // limitations under the License. - #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index d7edf5a5a1c..67b9f1d9e9b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,16 @@ #include #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include #include -#include #include "hipify.h" #else #include @@ -44,7 +49,11 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..9897917a33f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..4de0fcba63d 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..86a3cd9a726 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..93d10099466 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 
5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..739bab3dd59 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..1deb1cb8733 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 24b5f2f8eb3..b631ac08a23 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -191,17 +191,22 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #else #define cusolverDnCreate hipsolverCreate #define cusolverDnDestroy hipsolverDestroy #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream +#define cusolverDnSpotrf hipsolverSpotrf +#define cusolverDnSpotrfBatched hipsolverSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverSpotrs +#define cusolverDnSpotrsBatched hipsolverSpotrsBatched #endif -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -235,3 +240,4 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #endif //__HIPIFY_H__ + From 99101e8de70f17e670266f578638fe14e7785dce Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Tue, 6 Dec 2022 19:31:39 +0300 Subject: [PATCH 15/76] Do not use ADL to invoke std::binary_search --- src/tree/build-tree.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/build-tree.cc b/src/tree/build-tree.cc index 534f3352def..9726b5343ee 100644 --- a/src/tree/build-tree.cc +++ b/src/tree/build-tree.cc @@ -675,7 +675,7 @@ void AutomaticallyObtainQuestions(BuildTreeStatsType &stats, for (int32 i = 0; static_cast(i) < summed_stats.size(); i++) { // A check. if (summed_stats[i] != NULL && - !binary_search(phones.begin(), phones.end(), i)) { + !std::binary_search(phones.begin(), phones.end(), i)) { KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this]."; } } @@ -795,7 +795,7 @@ void KMeansClusterPhones(BuildTreeStatsType &stats, for (int32 i = 0; static_cast(i) < summed_stats.size(); i++) { // just a check. 
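+      // Qualify the call as std::binary_search; an unqualified call can be
+      // resolved to an unintended overload through argument-dependent lookup.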
if (summed_stats[i] != NULL && - !binary_search(phones.begin(), phones.end(), i)) { + !std::binary_search(phones.begin(), phones.end(), i)) { KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this]."; } } From a023f3fe56c0c43f7be5a6086f0cc1067aeebeae Mon Sep 17 00:00:00 2001 From: daanzu Date: Sun, 11 Dec 2022 06:54:17 -0500 Subject: [PATCH 16/76] SRILM: allow bypassing download/extraction during automated installation --- tools/extras/install_srilm.sh | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh index 813109dbb80..fa4b7b7ed80 100755 --- a/tools/extras/install_srilm.sh +++ b/tools/extras/install_srilm.sh @@ -16,30 +16,41 @@ fi ! command -v gawk > /dev/null && \ echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; -if [ $# -ne 3 ]; then - echo "SRILM download requires some information about you" - echo - echo "Usage: $0 " - exit 1 -fi - -srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" -post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" - -if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 +if [ ! -f srilm.tgz ] && [ ! -f srilm.tar.gz ] && [ ! -d srilm ]; then + if [ $# -ne 3 ]; then + echo "SRILM download requires some information about you" + echo + echo "Usage: $0 " + exit 1 + fi + + srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" + post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" + + if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then + echo 'There was a problem downloading the file.' + echo 'Check your internet connection and try again.' + exit 1 + fi + + if [ ! -s srilm.tar.gz ]; then + echo 'The file is empty. There was a problem downloading the file.' + exit 1 + fi fi mkdir -p srilm cd srilm - if [ -f ../srilm.tgz ]; then - tar -xvzf ../srilm.tgz # Old SRILM format -elif [ -f ../srilm.tar.gz ]; then - tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz + tar -xvzf ../srilm.tgz || exit 1 # Old SRILM format +elif [ -f ../srilm.tar.gz ]; then + tar -xvzf ../srilm.tar.gz || exit 1 # Changed format type from tgz to tar.gz +fi + +if [ ! -f RELEASE ]; then + echo 'The file RELEASE does not exist. There was a problem extracting.' + exit 1 fi major=`gawk -F. '{ print $1 }' RELEASE` From be22248e3a166d9ec52c78dac945f471e7c3a8aa Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 13 Dec 2022 02:29:16 -0800 Subject: [PATCH 17/76] [src] Make word alignment optional (#4802) * Remove unused variable. * cudadecoder: Make word alignment optional. For CTC models using word pieces or graphemes, there is not enough positional information to use the word alignment. I tried marking every unit as "singleton" word_boundary.txt, but this explodes the state space very, very often. See: https://github.com/nvidia-riva/riva-asrlib-decoder/issues/3 With the "_" character in CTC models predicting word pieces, we at the very least know which word pieces begin a word and which ones are either in the middle of the word or the end of a word, but the algorithm would still need to be rewritten, especially since "blank" is not a silence phoneme (it can appear between). I did look into using the lexicon-based word alignment. 
I don't have a specific complaint about it, but I did get a weird error where it couldn't create a final state at all in the output lattice, which caused Connect() to output an empty lattice. This may be because I wasn't quite sure how to handle the blank token. I treat it as its own phoneme, because of limitations in TransitionInformation, but this doesn't really make any sense. Needless to say, while the CTM outputs of the cuda decoder will be correct from a WER point of view, their time stamps won't be; they probably never were correct in the first place for CTC models. --- src/cudadecoder/lattice-postprocessor.cc | 15 ++++++++------- src/fstext/pre-determinize-inl.h | 2 -- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cudadecoder/lattice-postprocessor.cc b/src/cudadecoder/lattice-postprocessor.cc index 46d44216890..49f96191787 100644 --- a/src/cudadecoder/lattice-postprocessor.cc +++ b/src/cudadecoder/lattice-postprocessor.cc @@ -78,13 +78,14 @@ bool LatticePostprocessor::GetPostprocessedLattice( KALDI_ASSERT(decoder_frame_shift_ != 0.0 && "SetDecoderFrameShift() must be called (typically by pipeline)"); - if (!word_info_) - KALDI_ERR << "You must set --word-boundary-rxfilename in the lattice " - "postprocessor config"; - // ok &= - // Ignoring the return false for now (but will print a warning), - // because the doc says we can, and it can happen when using endpointing - WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + if (word_info_) { + // ok &= + // Ignoring the return false for now (but will print a warning), + // because the doc says we can, and it can happen when using endpointing + WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + } else { + *out_clat = clat; + } return ok; } diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index d51948f1877..b67b0ba6fa6 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -689,11 +689,9 @@ typename Arc::StateId CreateSuperFinal(MutableFst *fst) { typedef typename Arc::Weight Weight; assert(fst != NULL); StateId num_states = fst->NumStates(); - StateId num_final = 0; std::vector final_states; for (StateId s = 0; s < num_states; s++) { if (fst->Final(s) != Weight::Zero()) { - num_final++; final_states.push_back(s); } } From aa17817f53ea44e44275bc494e747baaccc2e4d2 Mon Sep 17 00:00:00 2001 From: Tanmay Jain Date: Mon, 26 Dec 2022 20:37:51 +0530 Subject: [PATCH 18/76] Fix variable name (#4815) Fix the "glossaries_opt" variable name at line 39 of prepare_subword_text.sh. It was misspelled, so the reference expanded to an empty string and words in the glossaries were not reserved while creating the BPE.
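A minimal shell sketch of the failure mode (illustrative names and values, not taken from the recipe):

    glossaries="[unk]"                          # hypothetical glossary token
    glossaries_opt="--glossaries $glossaries"   # option is built correctly
    echo apply_bpe.py $glossaires_opt           # typo: this variable is unset
    # prints just "apply_bpe.py" -- the --glossaries option silently vanishes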
--- egs/wsj/s5/utils/subword/prepare_subword_text.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh index aa0163235a6..2a5750c9238 100755 --- a/egs/wsj/s5/utils/subword/prepare_subword_text.sh +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -36,7 +36,7 @@ grep -q $separator $word_text && echo "$0: Error, word text file contains separa glossaries_opt= [ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" cut -d ' ' -f2- $word_text | \ - utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub if [ $word_text == $subword_text ]; then mv $word_text ${word_text}.old cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text From 0785b66521d9732a0b2916e601830d751089f360 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Thu, 5 Jan 2023 10:11:12 -0800 Subject: [PATCH 19/76] Add support for CUDA 12 and Hopper. --- src/configure | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/configure b/src/configure index ed627eceedc..95338ea1bd0 100755 --- a/src/configure +++ b/src/configure @@ -283,6 +283,7 @@ Either your CUDA is too new or too old." GCC_VER=$($CXX -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in + # Update this list by consulting https://gist.github.com/ax3l/9489132 # Disabling CUDA 7 and CUDA 8 because we now use C++14 to compile CUDA # code. It is still possible to use those cuda versions by switching # back to C++11 in src/makefiles/cuda_64bit.mk and use CUB <= 1.8.0. @@ -317,7 +318,13 @@ Either your CUDA is too new or too old." 11_*) MIN_UNSUPPORTED_GCC_VER="12.0" MIN_UNSUPPORTED_GCC_VER_NUM=120000 - ;; + CUSOLVER=true + ;; + 12_*) + MIN_UNSUPPORTED_GCC_VER="12.2" + MIN_UNSUPPORTED_GCC_VER_NUM=122000 + CUSOLVER=true + ;; *) failure "Unsupported CUDA version ${CUDA_VERSION}. 
Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ @@ -345,6 +352,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; 11_0) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80" ;; 11_*) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" ;; + 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; *) failure \ "Unsupported CUDA version ${CUDA_VERSION}. Please open an" \ "issue at https://github.com/kaldi-asr/kaldi/issues and" \ From ae8cbe8858f2a66a9b193c82dbe3b0479364165f Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 13 Dec 2022 11:03:29 -0800 Subject: [PATCH 20/76] [misc] Install python2.7 This is to fix a CI error. It appears that this is from using "ubuntu-latest" in the CI workflow. It got upgraded to ubuntu 22.04 automatically, and this doesn't have python2.7 by default. --- .github/workflows/c-cpp.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index c1f923cf58a..8a21c82ea8f 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -19,6 +19,8 @@ jobs: - uses: actions/checkout@v3 - name: Install sox run: sudo apt-get install -y sox intel-mkl + - name: Install python2 + run: sudo apt-get install -y python2 - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: From 8c3c0bca5dfd4dcb45174b0d2744deb246552b2a Mon Sep 17 00:00:00 2001 From: Neimhin Date: Fri, 20 Jan 2023 14:34:10 +0000 Subject: [PATCH 21/76] Update install_srilm.sh --- tools/extras/install_srilm.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh index 813109dbb80..9f305c9310f 100755 --- a/tools/extras/install_srilm.sh +++ b/tools/extras/install_srilm.sh @@ -16,15 +16,15 @@ fi ! command -v gawk > /dev/null && \ echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; -if [ $# -ne 3 ]; then +if [ $# -ne 4 ]; then echo "SRILM download requires some information about you" echo - echo "Usage: $0 " + echo "Usage: $0
" exit 1 fi -srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" -post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" +srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download2.php" +post_data="file=1.7.3&name=$1&org=$2&email=$3&address=$4&license=on" if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then echo 'There was a problem downloading the file.' From e4eb4f6725d836b7915230f54adedfb605379254 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Fri, 3 Feb 2023 21:19:56 +0100 Subject: [PATCH 22/76] egs/ami: Fix BUT path to wavs in AMI scripts, add beamformer config (#4820) - the audio data no longer exist in that path - the beamformer config was missing in 'ami/s5b', it's taken from 'ami/s5' --- egs/ami/s5/run_ihm.sh | 2 +- egs/ami/s5/run_mdm.sh | 2 +- egs/ami/s5/run_sdm.sh | 2 +- egs/ami/s5b/cmd.sh | 2 +- egs/ami/s5b/conf/ami_beamformit.cfg | 50 +++++++++++++++++++++++++++++ egs/ami/s5b/run.sh | 2 +- egs/ami/s5c/run.sh | 8 ++--- 7 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 egs/ami/s5b/conf/ami_beamformit.cfg diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 0d40d25c23a..ed91a980791 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 4389c6b5d81..0cc76a56dd0 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -10,7 +10,7 @@ mic=mdm$nmics # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh index 17e2071f1f6..a212a8846b2 100755 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5b/cmd.sh b/egs/ami/s5b/cmd.sh index b004c5569df..a8ea5d7c1ba 100644 --- a/egs/ami/s5b/cmd.sh +++ b/egs/ami/s5b/cmd.sh @@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G" # the use of cuda_cmd is deprecated, used only in 'nnet1', export cuda_cmd="queue.pl --gpu 1 --mem 20G" -if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then +if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" diff --git a/egs/ami/s5b/conf/ami_beamformit.cfg b/egs/ami/s5b/conf/ami_beamformit.cfg new file mode 100644 
index 00000000000..70fdd858651 --- /dev/null +++ b/egs/ami/s5b/conf/ami_beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index 79989f17004..94cd81f230b 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -28,7 +28,7 @@ set -euo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5c/run.sh b/egs/ami/s5c/run.sh index cc4cd87610b..1281cad2e43 100755 --- a/egs/ami/s5c/run.sh +++ b/egs/ami/s5c/run.sh @@ -3,7 +3,7 @@ # Apache 2.0. # # This recipe performs diarization for the mix-headset data in the -# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 +# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 # corpus with simulated RIRs. We use oracle SAD in this recipe. # This recipe demonstrates the following: # 1. Diarization using x-vector and clustering (AHC, VBx, spectral) @@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx) # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac @@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then local/ami_download.sh $mic $AMI_DIR fi -# Prepare data directories. +# Prepare data directories. if [ $stage -le 2 ]; then # Download the data split and references from BUT's AMI setup if ! 
[ -d AMI-diarization-setup ]; then @@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\ ivector-normalize-length ark:- ark:- |" \ $model_dir/xvectors_plda_train/plda || exit 1; - + cp $model_dir/xvectors_plda_train/plda $model_dir/ cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ From ed910d6090e48417a90084d7161023f429fa4e1e Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Sat, 18 Feb 2023 14:15:56 +0300 Subject: [PATCH 23/76] Fix -Wdeprecated-copy from c++11 --- src/fstext/lattice-weight.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 7637c4d1c55..6e7737a195d 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -438,11 +438,9 @@ class CompactLatticeWeightTpl { CompactLatticeWeightTpl(const WeightType &w, const std::vector &s): weight_(w), string_(s) { } - CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) { - weight_ = w.weight_; - string_ = w.string_; - return *this; - } + CompactLatticeWeightTpl(const CompactLatticeWeightTpl &compactLatticeWeightTpl) = default; + + CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) = default; const W &Weight() const { return weight_; } From 59299d1cf95b72bb109d583947d9e9ece19aa6dc Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Mon, 20 Feb 2023 11:36:50 +0800 Subject: [PATCH 24/76] Fix for issue#4801 (#4826) --- cmake/gen_cmake_skeleton.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py index 5925c6369a8..c8fee4c415f 100644 --- a/cmake/gen_cmake_skeleton.py +++ b/cmake/gen_cmake_skeleton.py @@ -269,7 +269,7 @@ def gen_code(self): if len(self.depends) > 0: ret.append("target_link_libraries(" + self.target_name + " PUBLIC") - for d in self.depends: + for d in self.depends + ['-lcblas', '-llapack']: ret.append(" " + d) ret.append(")\n") From ab8fa9e46182c6550d115fb10c7032fedfd6e01a Mon Sep 17 00:00:00 2001 From: "Nickolay V. 
Shmyrev" Date: Mon, 17 Apr 2023 21:58:12 +0300 Subject: [PATCH 25/76] No need for atomicAdd for float2, conflicts with CUDA 12.1 (#4838) function is not used anyway --- src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..d803a915ea0 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -24,6 +24,7 @@ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { retval.y = a.y - b.y; return retval; } + __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x + b.x; @@ -31,11 +32,6 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { return retval; } -__device__ inline void atomicAdd(float2 *addr, float2 val) { - atomicAdd(reinterpret_cast(addr), val.x); - atomicAdd(reinterpret_cast(addr) + 1, val.y); -} - __device__ inline void operator+=(float2 &a, float2 &b) { // overloading += a.x += b.x; From 9a8588ac111e691a74bb5d98a6b11f699984f910 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Wed, 26 Apr 2023 11:26:01 +0300 Subject: [PATCH 26/76] More fixes of unwanted ADL usage of std algos (#4828) This continues the work started in #4809. --- src/fstext/determinize-star-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index e9650ca29a6..36c9ba397a6 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -725,7 +725,7 @@ void DeterminizerStar::EpsilonClosure:: { // this sorting is based on StateId - sort(ecinfo_.begin(), ecinfo_.end()); + std::sort(ecinfo_.begin(), ecinfo_.end()); output_subset->clear(); From 19185083f4ce3f74d7b2fc7494b8ea530feeab01 Mon Sep 17 00:00:00 2001 From: "Nickolay V. Shmyrev" Date: Wed, 26 Apr 2023 11:27:05 +0300 Subject: [PATCH 27/76] Fix matrix data offset for large matrices (#4823) * Fix matrix data offset for large matrices * Fix overflow in cudamatrix too --- src/cudamatrix/cu-matrix.h | 14 +++++++------- src/matrix/kaldi-matrix.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index a531ecd45b9..3ffe67d8b06 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -231,7 +231,7 @@ class CuMatrixBase { bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; /// Get size of matrix in bytes - MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } + size_t SizeInBytes() const { return static_cast(num_rows_)*static_cast(stride_)*sizeof(Real); } // Copy functions. These do not resize. 
From 9a8588ac111e691a74bb5d98a6b11f699984f910 Mon Sep 17 00:00:00 2001
From: Yuriy Chernyshov
Date: Wed, 26 Apr 2023 11:26:01 +0300
Subject: [PATCH 26/76] More fixes of unwanted ADL usage of std algos (#4828)

This continues the work started in #4809.
---
 src/fstext/determinize-star-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h
index e9650ca29a6..36c9ba397a6 100644
--- a/src/fstext/determinize-star-inl.h
+++ b/src/fstext/determinize-star-inl.h
@@ -725,7 +725,7 @@ void DeterminizerStar<F>::EpsilonClosure::
 
   {
     // this sorting is based on StateId
-    sort(ecinfo_.begin(), ecinfo_.end());
+    std::sort(ecinfo_.begin(), ecinfo_.end());
 
     output_subset->clear();
 
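The change above looks cosmetic but is not: an unqualified call to sort() is resolved with argument-dependent lookup, so the namespaces associated with the iterators' type are searched as well, and any function named sort in the namespace of the element type becomes a candidate, which can make the call ambiguous or pick the wrong overload. A self-contained illustration with a toy namespace (not Kaldi's actual types):

#include <algorithm>
#include <vector>

namespace mylib {
struct Info {
  int state;
  bool operator<(const Info &o) const { return state < o.state; }
};
// An unrelated helper that happens to share the name.
template <typename Iter>
void sort(Iter, Iter) {}
}  // namespace mylib

int main() {
  std::vector<mylib::Info> v{{3}, {1}, {2}};
  // sort(v.begin(), v.end());    // ADL also finds mylib::sort: ambiguous.
  std::sort(v.begin(), v.end());  // qualified: always the std algorithm
  return v.front().state == 1 ? 0 : 1;
}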
From 19185083f4ce3f74d7b2fc7494b8ea530feeab01 Mon Sep 17 00:00:00 2001
From: "Nickolay V. Shmyrev"
Date: Wed, 26 Apr 2023 11:27:05 +0300
Subject: [PATCH 27/76] Fix matrix data offset for large matrices (#4823)

* Fix matrix data offset for large matrices

* Fix overflow in cudamatrix too
---
 src/cudamatrix/cu-matrix.h | 14 +++++++-------
 src/matrix/kaldi-matrix.h  | 12 ++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index a531ecd45b9..3ffe67d8b06 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -231,7 +231,7 @@ class CuMatrixBase {
   bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
 
   /// Get size of matrix in bytes
-  MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
+  size_t SizeInBytes() const { return static_cast<size_t>(num_rows_)*static_cast<size_t>(stride_)*sizeof(Real); }
 
   // Copy functions.  These do not resize.
   template<typename OtherReal>
@@ -670,13 +670,13 @@ class CuMatrixBase {
   inline const CuSubVector<Real> Row(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+    return CuSubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   inline CuSubVector<Real> Row(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+    return CuSubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
@@ -684,7 +684,7 @@ class CuMatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return CuValue<Real>(data_ + r * stride_ + c);
+    return CuValue<Real>(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
 
   inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
@@ -692,7 +692,7 @@ class CuMatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return CuValue<Real>(data_ + r * stride_ + c);  // will be casted to Real.
+    return CuValue<Real>(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);  // will be casted to Real.
   }
 
   Real Sum() const;
@@ -737,10 +737,10 @@ class CuMatrixBase {
 
   /// Get raw row pointer (const).  Warning: may return a pointer to GPU memory.  Use at
   /// your own risk.
-  inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
+  inline const Real* RowData(MatrixIndexT r) const { return data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_); }
   /// Get raw row pointer.  Warning: may return a pointer to GPU memory.  Use at
   /// your own risk.
-  inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
+  inline Real* RowData(MatrixIndexT r) { return data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_); }
   /// Return data pointer (const).  Warning: may return a pointer to GPU memory.
   /// Use at your own risk.
   inline const Real *Data() const { return data_; }
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index bc95c9189f6..064edf4237b 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -87,14 +87,14 @@ class MatrixBase {
   inline Real* RowData(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
+    return data_ + static_cast<size_t>(i) * static_cast<size_t>(stride_);
   }
 
   /// Returns pointer to data for one row (const)
   inline const Real* RowData(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
+    return data_ + static_cast<size_t>(i) * static_cast<size_t>(stride_);
   }
 
   /// Indexing operator, non-const
@@ -104,7 +104,7 @@ class MatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
+    return *(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
   /// Indexing operator, provided for ease of debugging (gdb doesn't work
   /// with parenthesis operator).
@@ -117,7 +117,7 @@ class MatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
+    return *(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
 
   /*   Basic setting-to-special values functions. */
@@ -188,14 +188,14 @@ class MatrixBase {
   inline const SubVector<Real> Row(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+    return SubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   /// Return specific row of matrix.
   inline SubVector<Real> Row(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+    return SubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   /// Return a sub-part of matrix.
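The pattern in both files is the same: row * stride was computed in 32-bit MatrixIndexT arithmetic and only widened afterwards, so matrices beyond about 2^31 elements produced wrapped offsets. A compact standalone demonstration (not Kaldi code) of why the cast must happen before the multiply:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t num_rows = 70000, stride = 40000;  // 2.8e9 elements, above 2^31
  // Wrong: the product is formed in 32 bits first (undefined behavior for
  // signed int; in practice it wraps and typically goes negative).
  int64_t bad = num_rows * stride;
  // Right: widen each operand before multiplying, as the patch does.
  size_t good = static_cast<size_t>(num_rows) * static_cast<size_t>(stride);
  printf("bad = %lld, good = %zu\n", (long long)bad, good);
  return 0;
}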
From 40fa1487f89a076cca75178bd2ddd73edb07dff9 Mon Sep 17 00:00:00 2001
From: sendream <1149593720@qq.com>
Date: Thu, 27 Apr 2023 15:27:20 +0800
Subject: [PATCH 28/76] Add recipe of Tibetan Amdo dialect

---
 egs/xbmu_amdo31/README.txt                    |  11 +
 egs/xbmu_amdo31/s5/RESULTS                    |   8 +
 egs/xbmu_amdo31/s5/cmd.sh                     |  15 ++
 egs/xbmu_amdo31/s5/conf/decode.config         |   5 +
 egs/xbmu_amdo31/s5/conf/mfcc.conf             |   2 +
 egs/xbmu_amdo31/s5/conf/mfcc_hires.conf       |  10 +
 egs/xbmu_amdo31/s5/conf/online_cmvn.conf      |   1 +
 egs/xbmu_amdo31/s5/conf/online_pitch.conf     |   4 +
 egs/xbmu_amdo31/s5/conf/pitch.conf            |   1 +
 egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh    |   1 +
 .../s5/local/chain/tuning/run_tdnn_1a.sh      | 184 +++++++++++++++
 .../s5/local/chain/tuning/run_tdnn_2a.sh      | 211 ++++++++++++++++++
 .../s5/local/download_and_untar.sh            | 105 +++++++++
 .../s5/local/nnet3/run_ivector_common.sh      | 159 +++++++++++++
 egs/xbmu_amdo31/s5/local/nnet3/run_tdnn.sh    |   1 +
 .../s5/local/nnet3/tuning/run_tdnn_1a.sh      | 117 ++++++++++
 .../s5/local/nnet3/tuning/run_tdnn_2a.sh      | 145 ++++++++++++
 egs/xbmu_amdo31/s5/local/score.sh             |   8 +
 egs/xbmu_amdo31/s5/local/wer_hyp_filter       |  19 ++
 egs/xbmu_amdo31/s5/local/wer_output_filter    |  25 +++
 egs/xbmu_amdo31/s5/local/wer_ref_filter       |  19 ++
 .../s5/local/xbmu_amdo31_data_prep.sh         |  77 +++++++
 .../s5/local/xbmu_amdo31_prepare_dict.sh      |  36 +++
 .../s5/local/xbmu_amdo31_train_lms.sh         |  88 ++++++++
 egs/xbmu_amdo31/s5/path.sh                    |   6 +
 egs/xbmu_amdo31/s5/run.sh                     | 156 +++++++++++++
 egs/xbmu_amdo31/s5/steps                      |   1 +
 egs/xbmu_amdo31/s5/utils                      |   1 +
 28 files changed, 1416 insertions(+)
 create mode 100644 egs/xbmu_amdo31/README.txt
 create mode 100644 egs/xbmu_amdo31/s5/RESULTS
 create mode 100644 egs/xbmu_amdo31/s5/cmd.sh
 create mode 100644 egs/xbmu_amdo31/s5/conf/decode.config
 create mode 100644 egs/xbmu_amdo31/s5/conf/mfcc.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/online_cmvn.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/online_pitch.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/pitch.conf
 create mode 120000 egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/download_and_untar.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
 create mode 120000 egs/xbmu_amdo31/s5/local/nnet3/run_tdnn.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_1a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/score.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_hyp_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_output_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_ref_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
 create mode 100755 egs/xbmu_amdo31/s5/path.sh
 create mode 100755 egs/xbmu_amdo31/s5/run.sh
 create mode 120000 egs/xbmu_amdo31/s5/steps
 create mode 120000 egs/xbmu_amdo31/s5/utils

diff --git a/egs/xbmu_amdo31/README.txt b/egs/xbmu_amdo31/README.txt
new file mode 100644
index 00000000000..d2cda16fa58
--- /dev/null
+++ b/egs/xbmu_amdo31/README.txt
@@ -0,0 +1,11 @@
+About the XBMU-AMDO31 corpus: XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.
+
+The XBMU-AMDO31 dataset is a speech recognition corpus of the Tibetan Amdo dialect. The open-source corpus contains 31 hours of speech data and resources for building speech recognition systems, including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused for the Amdo dialect because of the uniformity of written Tibetan.) The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
+
+The database can be downloaded from openslr:
+http://www.openslr.org/133/
+
+For more details, please visit:
+https://huggingface.co/datasets/syzym/xbmu_amdo31
+
+This recipe includes several ASR models trained with XBMU-AMDO31.
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/RESULTS b/egs/xbmu_amdo31/s5/RESULTS
new file mode 100644
index 00000000000..e50e43dc4db
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/RESULTS
@@ -0,0 +1,8 @@
+%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0
+%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0
+%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0
+%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0
+%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0
+%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0
+%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0
+%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0
diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh
new file mode 100644
index 00000000000..1ba3f789bf8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd=run.pl
+export decode_cmd=run.pl
+export mkgraph_cmd=run.pl
diff --git a/egs/xbmu_amdo31/s5/conf/decode.config b/egs/xbmu_amdo31/s5/conf/decode.config
new file mode 100644
index 00000000000..d91f86183af
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/decode.config
@@ -0,0 +1,5 @@
+beam=11.0 # beam for decoding.  Was 13.0 in the scripts.
+first_beam=8.0 # beam for 1st-pass decoding in SAT.
+
+
+
diff --git a/egs/xbmu_amdo31/s5/conf/mfcc.conf b/egs/xbmu_amdo31/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..a1aa3d6c158
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false # only non-default option.
+--sample-frequency=16000
diff --git a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..ca067e77b37
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--sample-frequency=16000 # XBMU-AMDO31 is sampled at 16kHz
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800)
diff --git a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..591367e7ae9
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
diff --git a/egs/xbmu_amdo31/s5/conf/online_pitch.conf b/egs/xbmu_amdo31/s5/conf/online_pitch.conf
new file mode 100644
index 00000000000..c0f1342160d
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/online_pitch.conf
@@ -0,0 +1,4 @@
+--sample-frequency=16000
+--simulate-first-pass-online=true
+--normalization-right-context=25
+--frames-per-chunk=10
diff --git a/egs/xbmu_amdo31/s5/conf/pitch.conf b/egs/xbmu_amdo31/s5/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..0c7ddcfe471
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+
+# This script is based on run_tdnn_7h.sh in swbd chain recipe.
+
+set -e
+
+# configs for 'chain'
+affix=
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn_1a  # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=1
+num_jobs_final=2
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=43 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_sp_lats \
+    --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 5 --cmd "$decode_cmd" \
+      --online-ivector-dir exp/nnet3/ivectors_$test_set \
+      $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+exit;
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
new file mode 100755
index 00000000000..669a014e8cf
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -0,0 +1,211 @@
+#!/usr/bin/env bash
+
+# This script is based on run_tdnn_1a.sh.
+# This setup uses online pitch to train the neural network.
+# It requires an online_pitch.conf in the conf dir.
+
+set -e
+
+# configs for 'chain'
+affix=
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn_2a  # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=2
+num_jobs_final=12
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=43 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires_online \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_sp_lats \
+    --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" \
+      --online-ivector-dir exp/nnet3/ivectors_$test_set \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+if [ $stage -le 14 ]; then
+  steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
+    --add-pitch true \
+    $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1;
+fi
+
+dir=${dir}_online
+if [ $stage -le 15 ]; then
+  for test_set in dev test; do
+    steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" \
+      --config conf/decode.config \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+if [ $stage -le 16 ]; then
+  for test_set in dev test; do
+    steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" --per-utt true \
+      --config conf/decode.config \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1;
+  done
+fi
+
+exit;
diff --git a/egs/xbmu_amdo31/s5/local/download_and_untar.sh b/egs/xbmu_amdo31/s5/local/download_and_untar.sh
new file mode 100755
index 00000000000..9c70836bf46
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Johns Hopkins University (author: Daniel Povey)
+#           2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..70d492b2774
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts).  It contains the common feature preparation and
+# iVector-related parts of the script.  See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train
+test_sets="dev test"
+gmm=tri5a
+online=false
+nnet3_affix=
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_sp_ali
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+online_affix=
+if [ $online = true ]; then
+  online_affix=_online
+fi
+
+if [ $stage -le 1 ]; then
+  # Although the nnet will be trained by high resolution data, we still have to
+  # perturb the normal data to get the alignments.  _sp stands for speed-perturbed.
+  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+  echo "$0: making MFCC features for low-resolution speed-perturbed data"
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \
+    exp/make_mfcc/train_sp mfcc_perturbed || exit 1;
+  steps/compute_cmvn_stats.sh data/${train_set}_sp \
+    exp/make_mfcc/train_sp mfcc_perturbed || exit 1;
+  utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: aligning with the perturbed low-resolution data"
+  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+    data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
+fi
+
+if [ $stage -le 3 ]; then
+  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
+  # this shows how you can split across multiple file-systems.
+  echo "$0: creating high-resolution MFCC features"
+  mfccdir=mfcc_perturbed_hires$online_affix
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/xbmu_amdo-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
+  fi
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix
+  done
+
+  # do volume-perturbation on the training data prior to extracting hires
+  # features; this helps make trained nnets more invariant to test data volume.
+  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1;
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1;
+    # create MFCC data dir without pitch to extract iVector
+    utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: computing a subset of data to train the diagonal UBM."
+  # We'll use about a quarter of the data.
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..a5b129be31c --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+  for decode_set in dev test; do
+    num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+    decode_dir=${dir}_online/decode_${decode_set}_per_utt
+    steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
+      --config conf/decode.config --per-utt true \
+      $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1;
+  done
+fi
+
+wait;
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/score.sh b/egs/xbmu_amdo31/s5/local/score.sh
new file mode 100755
index 00000000000..d283ceb68dc
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/score.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/xbmu_amdo31/s5/local/wer_hyp_filter b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/wer_output_filter b/egs/xbmu_amdo31/s5/local/wer_output_filter
new file mode 100755
index 00000000000..aceeeec41b4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (<>) {
+  @F = split " ";
+  print $F[0] . " ";
+  foreach $s (@F[1..$#F]) {
+    if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) {
+      print "";
+    } else {
+      print "$s"
+    }
+    print " ";
+  }
+  print "\n";
+}
+
+
diff --git a/egs/xbmu_amdo31/s5/local/wer_ref_filter b/egs/xbmu_amdo31/s5/local/wer_ref_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_ref_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
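The three filters above normalize text before scoring: wer_output_filter blanks out bracketed noise marks, angle-bracket tokens, and !SIL, while wer_hyp_filter and wer_ref_filter drop whatever tokens are listed in @filters. A quick sanity check of the output filter (the utterance id and tokens below are made up for illustration):

echo 'utt001 [NOISE] foo !SIL bar' | local/wer_output_filter
# prints "utt001  foo  bar " -- filtered tokens become empty fields,
# which the downstream scoring tools then ignore.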
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
new file mode 100755
index 00000000000..5cda85774a7
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+# Copyright 2017  Xingyu Na
+#           2021  Northwest Minzu University (Senyan Li)
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <audio-path> <text-path>"
+  echo " $0 /export/data/xbmu_amdo31/data/wav /export/data/xbmu_amdo31/data/transcript"
+  exit 1;
+fi
+
+tibetan_audio_dir=$1
+tibetan_text=$2/transcript_clean.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $tibetan_audio_dir ] || [ ! -f $tibetan_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+echo $tibetan_audio_dir
+# find wav audio file for train, dev and test resp.
+find $tibetan_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 22630 ] && \
+  echo Warning: expected 22630 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+# Transcriptions preparation
+# cat $tibetan_text |head -10
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all
+  rm -f $dir/transcripts1.txt
+  cat $dir/utt.list |while read line
+  do
+    line1=`echo $line |cut -d "-" -f 2`
+    line2=`grep -w $line1 $tibetan_text |cut -d " " -f 2-`
+    text=$line" "$line2
+    echo $text >>$dir/transcripts1.txt
+  done
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: tibetan data preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
new file mode 100755
index 00000000000..1e5537858ff
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: Tibetan dict preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
new file mode 100755
index 00000000000..eaca5e2fafa
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments.  It assumes you have already run
+# xbmu_amdo31_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=`which train_lm.sh`
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of <SPOKEN_NOISE> as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<SPOKEN_NOISE>" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# From here is some commands to do a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+  # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "<SPOKEN_NOISE>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
+# Difference in WSJ must have been due to different treatment of <UNK>.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
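The first awk command in the script above is the OOV-mapping step: every transcript word that is missing from the lexicon is rewritten as <SPOKEN_NOISE> before any counting happens. The same idea in isolation, with a two-word toy lexicon (all file names and words below are made up for illustration):

cat > toy_lexicon.txt <<EOF
hello h eh l ow
world w er l d
EOF
# Field 1 of each transcript line is the utterance id, so start at n=2 here.
echo "utt1 hello zebra world" | \
  awk -v lex=toy_lexicon.txt \
    'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
     {for(n=2;n<=NF;n++){ if (seen[$n]) { printf("%s ", $n); }
                          else { printf("<SPOKEN_NOISE> "); } } printf("\n");}'
# prints: hello <SPOKEN_NOISE> world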
diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/xbmu_amdo31/s5/run.sh b/egs/xbmu_amdo31/s5/run.sh
new file mode 100755
index 00000000000..61b3e8f62d8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/run.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+# Copyright 2021  Northwest Minzu University (Author: Senyan Li)
+#           2017  Hui Bu
+#           2017  Jiayu Du
+#           2017  Xingyu Na
+#           2017  Bengu Wu
+#           2017  Hao Zheng
+# Apache 2.0
+
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
+# corpus directory and download URL
+data=/home1/lsy/kaldi/egs/xbmu_amdo31/s5/export/data
+data_url=www.openslr.org/resources/133
+
+. ./cmd.sh
+
+#local/download_and_untar.sh $data $data_url xbmu-amdo31 || exit 1;
+
+# Lexicon Preparation,
+local/xbmu_amdo31_prepare_dict.sh $data/xbmu_amdo31/resource || exit 1;
+
+# Data Preparation,
+local/xbmu_amdo31_data_prep.sh $data/xbmu_amdo31/data/wav $data/xbmu_amdo31/data/transcript || exit 1;
+
+# Phone Sets, questions, L compilation
+utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
+  "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1;
+
+# LM training
+local/xbmu_amdo31_train_lms.sh || exit 1;
+
+# G compilation, check LG composition
+utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
+  data/local/dict/lexicon.txt data/lang_test || exit 1;
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+for x in train dev test; do
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  utils/fix_data_dir.sh data/$x || exit 1;
+done
+
+# Train a monophone model on delta features.
+steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono || exit 1;
+
+# Decode with the monophone model.
+utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/dev exp/mono/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/test exp/mono/decode_test
+
+# Get alignments from monophone system.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+# Train the first triphone pass model tri1 on delta + delta-delta features.
+steps/train_deltas.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
+
+# decode tri1
+utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri1/graph data/dev exp/tri1/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri1/graph data/test exp/tri1/decode_test
+
+# align tri1
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
+
+# train tri2 [delta+delta-deltas]
+steps/train_deltas.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
+
+# decode tri2
+utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri2/graph data/dev exp/tri2/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri2/graph data/test exp/tri2/decode_test
+
+# Align training data with the tri2 model.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
+
+# Train the second triphone pass model tri3a on LDA+MLLT features.
+steps/train_lda_mllt.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
+
+# Run a test decode with the tri3a model.
+utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri3a/graph data/dev exp/tri3a/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri3a/graph data/test exp/tri3a/decode_test
+
+# align tri3a with fMLLR
+
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
+
+# Train the third triphone pass model tri4a on LDA+MLLT+SAT features.
+# From now on, we start building a more serious system with Speaker
+# Adaptive Training (SAT).
+steps/train_sat.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
+
+# decode tri4a
+utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri4a/graph data/dev exp/tri4a/decode_dev
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri4a/graph data/test exp/tri4a/decode_test
+
+# align tri4a with fMLLR
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri4a exp/tri4a_ali
+
+# Train tri5a, which is LDA+MLLT+SAT
+# Building a larger SAT system. You can see the num-leaves is 3500 and tot-gauss is 100000
+
+steps/train_sat.sh --cmd "$train_cmd" \
+  3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
+
+# decode tri5a
+utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1;
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1;
+
+# align tri5a with fMLLR
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
+
+# nnet3
+local/nnet3/run_tdnn.sh
+
+# chain
+local/chain/run_tdnn.sh
+
+# getting results (see RESULTS file)
+for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+for x in exp/*/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/steps b/egs/xbmu_amdo31/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/utils b/egs/xbmu_amdo31/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file

From 3947a4be7815a968a1ffc0d3def8a7afd9949461 Mon Sep 17 00:00:00 2001
From: sendream <1149593720@qq.com>
Date: Thu, 27 Apr 2023 17:32:02 +0800
Subject: [PATCH 29/76] Modify code to conform to ShellCheck specifications

---
 egs/xbmu_amdo31/s5/cmd.sh                            |  6 +++---
 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_1a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh |  8 +++++---
 egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh    | 10 +++++-----
 egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh    |  2 +-
 egs/xbmu_amdo31/s5/path.sh                           |  2 +-
 9 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh
index 1ba3f789bf8..71dd849a93b 100644
--- a/egs/xbmu_amdo31/s5/cmd.sh
+++ b/egs/xbmu_amdo31/s5/cmd.sh
@@ -10,6 +10,6 @@
 # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
 # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
 
-export train_cmd=run.pl
-export decode_cmd=run.pl
-export mkgraph_cmd=run.pl
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+export mkgraph_cmd="queue.pl --mem 8G"
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
index 0c7ddcfe471..826aa163f2a 100755
--- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -27,7 +27,7 @@ common_egs_dir=
 xent_regularize=0.1
 
 # End configuration section.
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $*"  # Print the command line for logging
 
 . ./cmd.sh
 . ./path.sh
./path.sh diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh index 669a014e8cf..52d56adbc60 100755 --- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh @@ -29,7 +29,7 @@ common_egs_dir= xent_regularize=0.1 # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $*" # Print the command line for logging . ./cmd.sh . ./path.sh diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh index 70d492b2774..610774fb2a2 100755 --- a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh +++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh @@ -90,7 +90,7 @@ if [ $stage -le 4 ]; then temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm num_utts_total=$(wc -l $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` +n=$(wc -l < "$tmp_dir/wav.flist") [ $n -ne 22630 ] && \ echo Warning: expected 141925 data data files, found $n @@ -49,13 +49,13 @@ for dir in $train_dir $dev_dir $test_dir; do sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all rm -f $dir/transcripts1.txt - cat $dir/utt.list |while read line + while read -r line do - line1=`echo $line |cut -d "-" -f 2` - line2=`grep -w $line1 $tibetan_text |cut -d " " -f 2-` + line1=$(echo "$line" | cut -d '-' -f 2) + line2=$(grep -w $line1 $tibetan_text |cut -d " " -f 2-) text=$line" "$line2 echo $text >>$dir/transcripts1.txt - done + done < "$dir/utt.list" paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt awk '{print $1}' $dir/transcripts.txt > $dir/utt.list diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh index eaca5e2fafa..658f0e7bc15 100755 --- a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh +++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh @@ -19,7 +19,7 @@ done dir=data/local/lm mkdir -p $dir -kaldi_lm=`which train_lm.sh` +kaldi_lm=$(command -v train_lm.sh) if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh index 2d17b17a84a..b70ffbfbb26 100755 --- a/egs/xbmu_amdo31/s5/path.sh +++ b/egs/xbmu_amdo31/s5/path.sh @@ -1,4 +1,4 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=$(pwd)/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 From cbc728497deded941940666292c418490ac87bce Mon Sep 17 00:00:00 2001 From: sendream <1149593720@qq.com> Date: Thu, 27 Apr 2023 17:46:53 +0800 Subject: [PATCH 30/76] Modify code to conform to ShellCheck specifications --- egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh index 6936f389bbb..3f920315b77 100755 --- a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -122,7 +122,7 @@ if [ $stage -le 11 ]; then # previous utterances of the same speaker. for decode_set in dev test; do # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(cat "data/${decode_set}_hires_online/utt2spk" | cut -d' ' -f2 | sort -u | wc -l) + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) decode_dir=${dir}_online/decode_$decode_set steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ --config conf/decode.config \ @@ -135,7 +135,7 @@ if [ $stage -le 12 ]; then # without carrying forward speaker information. for decode_set in dev test; do # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(cat "data/${decode_set}_hires_online/utt2spk" | cut -d' ' -f2 | sort -u | wc -l) + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) decode_dir=${dir}_online/decode_${decode_set}_per_utt steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ --config conf/decode.config --per-utt true \ From 0d7f17f3303bdcd5bfab4bdd5714bbd26dd2631a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 2 May 2023 23:44:15 +0800 Subject: [PATCH 31/76] Fix download location in install_liblbfgs.sh --- tools/extras/install_liblbfgs.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) mode change 100644 => 100755 tools/extras/install_liblbfgs.sh diff --git a/tools/extras/install_liblbfgs.sh b/tools/extras/install_liblbfgs.sh old mode 100644 new mode 100755 index 10f72cad84f..8a726dd144d --- a/tools/extras/install_liblbfgs.sh +++ b/tools/extras/install_liblbfgs.sh @@ -1,7 +1,9 @@ #!/bin/bash + VER=1.10 -if [ ! -f liblbfgs-$VER.tar.gz ]; then - wget https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz +if [ ! -f liblbfgs$VER.tar.gz ]; then + wget https://danielpovey.com/files/liblbfgs-1.10.tar.gz + ## wget https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz fi tar -xzf liblbfgs-$VER.tar.gz @@ -29,4 +31,3 @@ cd .. 
echo "export LIBLBFGS=$wd/liblbfgs-1.10" echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs ) >> env.sh - From 039ccbf26d241cf40d2e85a7bddadd97d06f5b5d Mon Sep 17 00:00:00 2001 From: Baffin Lee Date: Fri, 5 May 2023 11:47:26 +0000 Subject: [PATCH 32/76] [egs] convert tuple to NDArray before call sklearn.manifold.TSNE --- egs/gop_speechocean762/s5/local/visualize_feats.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/egs/gop_speechocean762/s5/local/visualize_feats.py b/egs/gop_speechocean762/s5/local/visualize_feats.py index 3b3ddaa037a..202c6a57b6b 100644 --- a/egs/gop_speechocean762/s5/local/visualize_feats.py +++ b/egs/gop_speechocean762/s5/local/visualize_feats.py @@ -8,6 +8,7 @@ import random import kaldi_io import seaborn as sns +import numpy as np from collections import Counter from sklearn.manifold import TSNE from utils import load_human_scores, load_phone_symbol_table @@ -62,6 +63,9 @@ def main(): min(args.samples, len(lables))) features, lables = list(zip(*sampled_paris)) + # Convert the tuple of arrays to a single 2D array + features = np.vstack(features) + # Draw scatters label_counter = Counter(lables) colors = sns.color_palette("colorblind", len(label_counter)) From f5805db451000c705bcefe8ef01658fb979f5cce Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Fri, 11 Aug 2023 12:41:12 -0500 Subject: [PATCH 33/76] Create Dockerfile Creates a Dockerfile in support of Ubuntu 22.04. --- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docker/ubuntu22.04-cuda12.2.0/Dockerfile diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile new file mode 100644 index 00000000000..1d247399c75 --- /dev/null +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -0,0 +1,48 @@ +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 +LABEL maintainer="williamhilton.works@gmail.com" + +RUN apt update && \ + apt install -y \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt update && \ + apt install -y \ + build-essential \ + g++ \ + make \ + automake \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + gfortran \ + patch \ + ffmpeg \ + vim \ + python2-dev \ + python3-dev && \ + apt update && \ + yes | DEBIAN_FRONTEND=noninteractive apt install -yqq \ + intel-mkl && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi/tools && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) && \ + find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + rm -rf /opt/kaldi/.git + +WORKDIR /opt/kaldi/ From 745c6e1f5c0b38c8bcbdfdd69c01b83ef7206e3d Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Fri, 11 Aug 2023 13:46:43 -0500 Subject: [PATCH 34/76] Update Dockerfile Reduces image size and re-organizes install list for clarity. 
--- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile index 1d247399c75..6e6ea2e7ce0 100644 --- a/docker/ubuntu22.04-cuda12.2.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -2,11 +2,7 @@ FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 LABEL maintainer="williamhilton.works@gmail.com" RUN apt update && \ - apt install -y \ - software-properties-common && \ - apt-add-repository multiverse && \ - apt update && \ - apt install -y \ + apt install -y --no-install-recommends \ build-essential \ g++ \ make \ @@ -25,11 +21,13 @@ RUN apt update && \ gfortran \ patch \ ffmpeg \ - vim \ - python2-dev \ - python3-dev && \ + vim && \ + apt update && \ + apt install -y --no-install-recommends\ + software-properties-common && \ + apt-add-repository multiverse && \ apt update && \ - yes | DEBIAN_FRONTEND=noninteractive apt install -yqq \ + yes | DEBIAN_FRONTEND=noninteractive apt install -yqq --no-install-recommends\ intel-mkl && \ rm -rf /var/lib/apt/lists/* From ebf624594ea46f0872f5de201b3c217999f3d8fc Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Sat, 12 Aug 2023 09:31:36 -0500 Subject: [PATCH 35/76] Update Dockerfile Using apt-get instead of apt --- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile index 6e6ea2e7ce0..ae413def077 100644 --- a/docker/ubuntu22.04-cuda12.2.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -1,8 +1,8 @@ FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 LABEL maintainer="williamhilton.works@gmail.com" -RUN apt update && \ - apt install -y --no-install-recommends \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ build-essential \ g++ \ make \ @@ -22,12 +22,12 @@ RUN apt update && \ patch \ ffmpeg \ vim && \ - apt update && \ - apt install -y --no-install-recommends\ + apt-get update && \ + apt-get install -y --no-install-recommends\ software-properties-common && \ apt-add-repository multiverse && \ - apt update && \ - yes | DEBIAN_FRONTEND=noninteractive apt install -yqq --no-install-recommends\ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ intel-mkl && \ rm -rf /var/lib/apt/lists/* From aef1d98603b68e6cf3a973e9dcd71915e2a175fe Mon Sep 17 00:00:00 2001 From: Egor Tyuvaev Date: Fri, 13 Oct 2023 11:19:51 +0200 Subject: [PATCH 36/76] Update install_mkl.sh Update Intel APT key to mitigate installation error --- tools/extras/install_mkl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh index 8c1899bdf2f..ddcd372a02c 100755 --- a/tools/extras/install_mkl.sh +++ b/tools/extras/install_mkl.sh @@ -16,7 +16,7 @@ default_package=intel-mkl-64bit-2020.0-088 yum_repo='https://yum.repos.intel.com/mkl/setup/intel-mkl.repo' apt_repo='https://apt.repos.intel.com/mkl' -intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB' +intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB' Usage () { cat >&2 < Date: Fri, 20 Oct 2023 11:48:09 +0000 Subject: [PATCH 37/76] Fix __CUDA_ARCH__ issue and add more hipification. 
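
The hipification approach below leaves the CUDA sources in place and remaps them
at compile time. As a rough sketch of what the build does for a single kernel
file, one can compile it by hand from src/chain (the gfx90a offload target and
this trimmed-down flag set are illustrative assumptions, not part of the patch):

    # Compile a CUDA kernel source as HIP; hipify.h then maps the CUDA API to ROCm.
    hipcc -c -x hip chain-kernels.cu -o /dev/null \
        --offload-arch=gfx90a -std=c++14 \
        -D__IS_HIP_COMPILE__=1 -DHAVE_CUDA=1 -DKALDI_DOUBLEPRECISION=0 \
        -I.. -I../hip

Passing -x hip is what makes hipcc treat the unmodified .cu file as HIP code, so
the macro mappings this patch extends in src/hip/hipify.h can do the API
translation.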
--- src/chain/chain-kernels.cu | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeatbin/Makefile | 8 +++++--- src/cudafeatbin/apply-batched-cmvn-online-cuda.cc | 2 ++ .../compute-fbank-online-batched-cuda.cc | 2 ++ .../compute-mfcc-online-batched-cuda.cc | 2 ++ .../compute-online-feats-batched-cuda.cc | 2 ++ src/cudafeatbin/compute-online-feats-cuda.cc | 2 ++ src/cudamatrix/cu-kernels.cu | 1 + src/hip/hipify.h | 5 +++++ src/makefiles/hip_64bit.mk | 15 +++++++++++---- 11 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 2a30128750c..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -21,6 +21,7 @@ #include "chain/chain-kernels-ansi.h" #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #endif diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 8d4648d04bb..1c896f1307f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -16,6 +16,7 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #else diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..2cd6bbb6a93 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,9 +16,11 @@ // limitations under the License. 
#if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..70380f8ccad 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudafeat/online-cuda-feature-pipeline.h" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1d6e0664541..1b0cf1f2c90 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,6 +30,7 @@ #include #include #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index b631ac08a23..723b5b1f059 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -148,6 +148,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaHostRegister hipHostRegister #define cudaHostRegisterDefault hipHostRegisterDefault #define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch @@ -157,12 +158,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop #define cudaSetDevice hipSetDevice #define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index e2f43ecd55c..8d85872aa9b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -29,12 +29,19 @@ ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ -I$(ROCMDIR)/rocrand/include \ -I$(ROCMDIR)/include \ -I.. -I../hip -isystem $(OPENFSTINC) + +# TODO: Consider passing __CUDA_ARCH__=800 here as it is mostly supported by ROCm. +# However this macro has some side effect with HIPCC that makes it assume +# CUDA is active and everything is device compiles. ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ - -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 + -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics -#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +# TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. 
+# We allow the libraries we link against to have undefined symbols so that this can be built on
+# systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
-CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64
+CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
+LDLIBS += -Wl,--allow-shlib-undefined

From 3a8896c2a3bd13835e45b11eed6f2ce0044d5260 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 1 Nov 2023 20:13:34 +0800
Subject: [PATCH 38/76] Fix LatticeSimpleDecoder

---
 src/decoder/lattice-simple-decoder.cc | 2 +-
 src/gmm/mle-diag-gmm.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc
index cc8712e854d..d6b0727ef07 100644
--- a/src/decoder/lattice-simple-decoder.cc
+++ b/src/decoder/lattice-simple-decoder.cc
@@ -571,7 +571,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() {
     }
     if (queue.empty()) {
       if (!warned_) {
-        KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is "
+        KALDI_LOG << "Error in ProcessNonEmitting: no surviving tokens: frame is "
                   << frame;
         warned_ = true;
       }
diff --git a/src/gmm/mle-diag-gmm.h b/src/gmm/mle-diag-gmm.h
index d41d36489bf..3763943a89b 100644
--- a/src/gmm/mle-diag-gmm.h
+++ b/src/gmm/mle-diag-gmm.h
@@ -93,7 +93,7 @@ struct MapDiagGmmOptions {
   void Register(OptionsItf *opts) {
     opts->Register("mean-tau", &mean_tau,
                    "Tau value for updating means.");
-    opts->Register("variance-tau", &mean_tau,
+    opts->Register("variance-tau", &variance_tau,
                    "Tau value for updating variances (note: only relevant if "
                    "update-flags contains \"v\".");
     opts->Register("weight-tau", &weight_tau,

From cdbc05b0d3611618297a7d21017de5f10126fc7a Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Nov 2023 11:02:01 +0000
Subject: [PATCH 39/76] Fixes to work with CUDA 12 toolkit

---
 src/cudamatrix/cu-kernels.cu | 73 ++++++++++++++-----------
 src/cudamatrix/cu-sparse-matrix-test.cc | 6 +-
 src/cudamatrix/cu-sparse-matrix.cc | 65 +++++++++++++++-------
 src/cudamatrix/cu-sparse-matrix.h | 21 +++----
 4 files changed, 99 insertions(+), 66 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8044ff699bc..7ffdc541113 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
- if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } - } // output TileDim sums per thread block if (tid < TileDim) { @@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } - // output TileDim sums per thread block if (tid < TileDim && j_n < dim_N.cols) { v[j_n] = alpha * smem.sum[tid] + beta * v[j_n]; @@ -1793,10 +1796,11 @@ static void _vec_transform_reduce( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); } + __syncwarp(); } // Output to vector result. @@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } // Output to vector result. @@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } - + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2087,13 +2095,12 @@ static void _group_transform_reduce( x_idx += threads_per_group; } sreduction[tid] = treduction; - if (threads_per_group > warpSize) { - __syncthreads(); - } + __syncthreads(); // tree-reduce to 2x warpSize elements per group # pragma unroll - for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) { + int shift = threads_per_group / 2; + for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } @@ -2101,14 +2108,12 @@ static void _group_transform_reduce( } // Warp-reduce to 1 element per group. - // Threads implicitly synchronized within the warp. - const int warp_reduce_size = - threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize; - if (threadIdx.x < warp_reduce_size) { # pragma unroll - for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { + for (; shift > 0; shift >>= 1) { + if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } + __syncwarp(); } // Store the result. 
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, } // reduce to 1 element per row - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; } + __syncwarp(); } // broadcast the sum results @@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, } // Warp reduce without __syncthreads() // (note.: synchronizes implicitly within a warp at the multiprocessor) - if (tid < warpSize / 2) { #pragma unroll - for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; - num_working_threads >>= 1) { + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (tid < warpSize / 2) { if (smax[tid + num_working_threads] > smax[tid]) { smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } } + __syncwarp(); } if (tid == 0) { diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index aad34b5dd54..0c2230a8731 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() { template static void UnitTestCuSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 2; i++) { - MatrixIndexT row = 10 + Rand() % 40; - MatrixIndexT col = 10 + Rand() % 50; + MatrixIndexT row = 2 + Rand() % 3; + MatrixIndexT col = 1 + Rand() % 4; CuMatrix mat1(row, col); CuMatrix mat2(col, row); @@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() { cu_smat2.CopyToMat(&mat2); Real trace1 = TraceMatMat(mat3, mat1, kTrans); + Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans); AssertEqual(trace1, trace2, 0.00001); trace1 = TraceMatMat(mat3, mat2, kNoTrans); trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans); + AssertEqual(trace1, trace2, 0.00001); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..f24613fa231 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -161,7 +161,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -194,8 +194,8 @@ template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( - NULL) { + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), + csr_val_(NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); @@ -266,8 +266,9 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); + csr_col_idx_ = NULL; // may be freed, but this is allowed. 
csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); @@ -277,10 +278,16 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( - (num_rows + 1 + nnz) * sizeof(int))); - csr_val_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int))); + if (nnz > 0) { + csr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + nnz * sizeof(int))); + csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); + } else { + csr_col_idx_ = NULL; + csr_val_ = NULL; + } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { @@ -302,8 +309,11 @@ void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - if (csr_row_ptr_col_idx_) { - CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); + if (csr_row_ptr_) { + CuDevice::Instantiate().Free(csr_row_ptr_); + } + if (csr_col_idx_) { + CuDevice::Instantiate().Free(csr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); @@ -311,7 +321,8 @@ void CuSparseMatrix::Destroy() { num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = NULL; + csr_row_ptr_ = NULL; + csr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -378,11 +389,17 @@ void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); - CuSubArray idx_to(csr_row_ptr_col_idx_, - NumRows() + 1 + NumElements()); - CuSubArray idx_from(smat.csr_row_ptr_col_idx_, - smat.NumRows() + 1 + smat.NumElements()); - idx_to.CopyFromArray(idx_from); + { + CuSubArray idx_to(csr_row_ptr_, NumRows() + 1); + CuSubArray idx_from(smat.csr_row_ptr_, NumRows() + 1); + idx_to.CopyFromArray(idx_from); + } + + { + CuSubArray idx_to(csr_col_idx_, NumElements()); + CuSubArray idx_from(smat.csr_col_idx_, NumElements()); + idx_to.CopyFromArray(idx_from); + } } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined); @@ -413,9 +430,14 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { smat->Resize(0, 0); return; } - CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); - std::vector idx_cpu; - idx.CopyToVec(&idx_cpu); + CuSubArray row_ptr(csr_row_ptr_, NumRows() + 1); + std::vector row_ptr_cpu; + row_ptr.CopyToVec(&row_ptr_cpu); + + + CuSubArray col_idx(csr_col_idx_, NumElements()); + std::vector col_idx_cpu; + col_idx.CopyToVec(&col_idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); @@ -425,8 +447,8 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { - for (; n < idx_cpu[i + 1]; ++n) { - const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; + for (; n < row_ptr_cpu[i + 1]; ++n) { + const MatrixIndexT j = col_idx_cpu[n]; pairs[i].push_back( { j, val_cpu(n) }); } } @@ -484,7 +506,8 @@ void CuSparseMatrix::Swap(CuSparseMatrix *smat) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); - std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); + std::swap(csr_row_ptr_, smat->csr_row_ptr_); + std::swap(csr_col_idx_, smat->csr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif diff --git a/src/cudamatrix/cu-sparse-matrix.h 
b/src/cudamatrix/cu-sparse-matrix.h index 82b17a0dc71..180beed6183 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,13 +121,13 @@ class CuSparseMatrix { /// Default constructor CuSparseMatrix() : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { } /// Constructor from CPU-based sparse matrix. explicit CuSparseMatrix(const SparseMatrix &smat) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat); } @@ -135,7 +135,7 @@ class CuSparseMatrix { /// Constructor from GPU-based sparse matrix (supports transposition). CuSparseMatrix(const CuSparseMatrix &smat, MatrixTransposeType trans = kNoTrans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat, trans); } @@ -200,19 +200,19 @@ class CuSparseMatrix { /// indices of the first nonzero element in the i-th row, while the last entry /// contains nnz_, as zero-based CSR format is used. const int* CsrRowPtr() const { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } int* CsrRowPtr() { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } /// Returns pointer to the integer array of length nnz_ that contains /// the column indices of the corresponding elements in array CsrVal() const int* CsrColIdx() const { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } int* CsrColIdx() { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } private: @@ -238,9 +238,10 @@ class CuSparseMatrix { // number of non-zeros MatrixIndexT nnz_; - // csr row ptrs and col indices in a single int array - // of the length (num_rows_ + 1 + nnz_) - int* csr_row_ptr_col_idx_; + // length num_rows_ + 1 + int* csr_row_ptr_; + // length nnz_ + int* csr_col_idx_; // csr value array of the length nnz_ Real* csr_val_; From fe127209cc0d653cef80a5e81487a95f0405de32 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sat, 4 Nov 2023 18:01:27 +0100 Subject: [PATCH 40/76] Openblas repo was renamed --- tools/extras/install_openblas.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/extras/install_openblas.sh b/tools/extras/install_openblas.sh index ce0fdf7fbdb..521d096adbd 100755 --- a/tools/extras/install_openblas.sh +++ b/tools/extras/install_openblas.sh @@ -19,18 +19,18 @@ fi tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz -rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz +rm -rf OpenMathLib-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz if [ -d "$DOWNLOAD_DIR" ]; then cp -p "$DOWNLOAD_DIR/$tarball" . else - url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') + url=$($WGET -qO- "https://api.github.com/repos/OpenMathLib/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python3 -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') test -n "$url" $WGET -t3 -nv -O $tarball "$url" fi tar xzf $tarball -mv xianyi-OpenBLAS-* OpenBLAS +mv OpenMathLib-OpenBLAS-* OpenBLAS make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install if [ $? 
-eq 0 ]; then From f584420d8c1448e8e70f9106aa49712f63d06347 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 6 Nov 2023 16:57:15 +0000 Subject: [PATCH 41/76] Fix tests with zero size matrices and needing syncwarp for LDS sharing. --- ...ure-online-batched-ivector-cuda-kernels.cu | 30 +++-- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 4 +- .../online-ivector-feature-cuda-kernels.cu | 26 ++-- src/cudamatrix/cu-device.cc | 4 + src/cudamatrix/cu-kernels.cu | 127 ++++++++++++++---- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-math.cc | 2 +- src/cudamatrix/cu-matrix-test.cc | 24 +++- src/cudamatrix/cu-matrix.cc | 12 +- src/cudamatrix/cu-sparse-matrix.cc | 6 +- src/cudamatrix/cu-vector.cc | 13 +- src/hip/hipify.h | 35 ++++- src/makefiles/hip_64bit.mk | 7 +- 15 files changed, 219 insertions(+), 90 deletions(-) diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b4cfce812c..e5b89d163e5 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -50,7 +50,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -101,8 +101,10 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -215,8 +217,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads dim3 blocks(num_chunk_frames, num_lanes); @@ -311,10 +313,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() + KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, 
feat_dim, ldf, @@ -324,8 +326,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, { int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -507,8 +509,8 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -584,7 +586,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -629,7 +631,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index f847311d755..27375f4914e 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -68,7 +68,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -487,7 +487,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 1c896f1307f..f8947a3b5ed 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -188,8 +188,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) 
threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index c320c85a029..9c0d5df5288 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -134,7 +134,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -493,7 +493,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 378ea18e689..dffc9fd3c8f 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -26,17 +26,17 @@ #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +47,13 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,9 +139,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -207,7 +207,7 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, 
float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +215,8 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -238,7 +238,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -249,7 +249,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 3dada172ba8..25775fb1b05 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -249,8 +249,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode + #ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; + #else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; + #endif // We want to choose the device more carefully, so release the CUDA context. 
e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1b0cf1f2c90..792932c18d5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -966,6 +966,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1118,8 +1119,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1176,6 +1177,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1219,6 +1221,7 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1270,6 +1273,7 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1353,6 +1357,7 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1805,6 +1810,7 @@ static void _vec_transform_reduce( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } } @@ -1904,7 +1910,6 @@ __global__ void _strided_reduction_fused_kernel(Real * __restrict__ dots, const int idx = colStart + (j + u*stride) * d.stride; vals[u] = op.Transform(data[idx]); } - #pragma unroll for (int u = 0; u < unroll_count; ++u) { thread_data = op.Reduce(thread_data, vals[u]); @@ -2018,6 +2023,7 @@ static void _transform_reduce_mat_rows( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } // Output to vector result. @@ -2042,9 +2048,27 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } + + // if (tid == 0) { + // for (int j = 0; j < d.cols; j += 1) + // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); + // result[i] = tdata; + + // } + // return; + sdata[tid] = tdata; __syncthreads(); + // if (tid == 0) { + // tdata = 0; + // for (int j = 0; j < CU1DBLOCK; j += 1) + // tdata = op.Reduce(tdata, op.Transform(sdata[j])); + // result[i] = tdata; + // } + + // return; + // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2053,12 +2077,30 @@ static void _transform_reduce_mat_cols( __syncthreads(); } + // if (tid == 0) { + // tdata = 0; + // for (int j = 0; j < 2*warpSize; j += 1) + // tdata = op.Reduce(tdata, op.Transform(sdata[j])); + // result[i] = tdata; + // } + + // return; + + // Reduce last warp. Threads implicitly synchronized within a warp. 
if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) - sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + for (int shift = warpSize; shift > 0; shift >>= 1) { + sdata[tid] += sdata[tid + shift]; + __syncwarp(); + //__syncthreads(); // Why this needed? + } } + if (tid == 0) + result[i] = sdata[0]; + + return; + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2117,6 +2159,7 @@ static void _group_transform_reduce( # pragma unroll for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); + __syncwarp(); } } @@ -2981,6 +3024,7 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, for (int shift = warpSize; shift > 0; shift >>= 1) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; + __syncwarp(); } } @@ -3271,6 +3315,7 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } + __syncwarp(0xffffffffu >> (32-num_working_threads)); } } @@ -3999,7 +4044,7 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) // grid dim x,y expands to fill out average in x/y across batches // grid dim.z is batch template @@ -4380,7 +4425,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4401,6 +4446,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -4409,9 +4459,13 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, const float* N, const MatrixDim dim_N, const float beta, float* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -4451,6 +4505,7 @@ void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim, transpose, elements); } + void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -5086,7 +5141,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, 
M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -5115,9 +5175,13 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, const double* N, const MatrixDim dim_N, const double beta, double* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -5488,25 +5552,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all threads without blocking. void cuda_legacy_noop() { +#ifdef __IS_HIP_COMPILE__ + // HIP doesn't currently support cudaStreamLegacy stream so we force to use the + // non-per-thread API to get similar semantics. 
@@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
 // this will synchronize all threads without blocking.
 void cuda_legacy_noop() {
+#ifdef __IS_HIP_COMPILE__
+  // HIP does not currently support the legacy default stream (cudaStreamLegacy),
+  // so we use the non-per-thread launch API to get similar semantics.
+  auto k = reinterpret_cast<void *>(_noop_kernel);
+  hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0);
+#else
   _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>();
+#endif
 }
 
 void cudaF_mat_copy_range_clamped(
@@ -5812,8 +5883,8 @@ void cudaF_mat_copy_range_clamped(
   float *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end, num_cols,
     src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5826,8 +5897,8 @@ void cudaD_mat_copy_range_clamped(
   double *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end, num_cols,
     src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5837,7 +5908,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const float **inputs,
                              int32_t *ldi, float **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc batch_desc;
@@ -5863,8 +5934,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.  Memory will be passed down directly
@@ -5886,8 +5957,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here.  Memory will be passed down directly
@@ -5902,7 +5973,7 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const double **inputs,
                              int32_t *ldi, double **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc batch_desc;
@@ -5928,8 +5999,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
      int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.  Memory will be passed down directly
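All the (n + tile - 1) / tile expressions these launchers introduce are ceiling division, sizing the grid so warp-shaped tiles cover the whole matrix. As a standalone sketch (the helper name is assumed, as is the CUDA side defining GPU_WARP_SIZE = 32 and GPU_MAX_WARPS_PER_BLOCK = 32, which is what recovering the old hard-coded 32x32 blocks implies):

    // Number of 'tile'-sized blocks needed to cover 'n' elements.
    static inline unsigned int ceil_div(unsigned int n, unsigned int tile) {
      return (n + tile - 1) / tile;
    }

    // Mirrors the launches above. On HIP, GPU_WARP_SIZE is 64 and
    // GPU_MAX_WARPS_PER_BLOCK is 1024 / 64 = 16, so blocks keep 1024
    // threads while the tile shape follows the wavefront width.
    //   dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
    //   dim3 blocks(ceil_div(num_cols, GPU_WARP_SIZE),
    //               ceil_div(num_rows, GPU_MAX_WARPS_PER_BLOCK));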
@@ -5951,8 +6022,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here.  Memory will be passed down directly
diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc
index e1d59e777be..1245fb28bad 100644
--- a/src/cudamatrix/cu-math-test.cc
+++ b/src/cudamatrix/cu-math-test.cc
@@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() {
   for (int32 loop = 0; loop < 10; loop++) {
 
     // problem dimensions.
-    int32 num_rows = RandInt(5, 20),
-        cell_dim = RandInt(2, 200),
-        dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
+    int32 num_rows = RandInt(5, 20), //16
+        cell_dim = RandInt(2, 200), //45
+        dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); //3
 
     // Pick the (input or params block), and output block, for which we'll
     // spot-check the derivative values.  This will give us test failures
@@ -232,7 +232,6 @@ void UnitTestLstmNonlinearity() {
     else
       test_params = -1;
 
-
     CuMatrix<Real> input(num_rows, cell_dim * 5 + dropout_dim),
         params(3, cell_dim),
         output_deriv(num_rows, cell_dim * 2);
@@ -277,11 +276,11 @@ void UnitTestLstmNonlinearity() {
     for (int32 i = 0; i < test_dim; i++) {
       CuMatrix<Real> delta_input(num_rows, 5 * cell_dim + dropout_dim),
           delta_params(3, cell_dim);
-      if (test_input >= 0) {
+      if (test_input >= 0) { // -1
         delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn();
         delta_input.Scale(delta);
       }
-      if (test_params >= 0) {
+      if (test_params >= 0) { // 0
         delta_params.Row(test_params).SetRandn();
         delta_params.Scale(delta);
       }
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 3fbeff3a470..d0d8e4e771f 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase<Real> &input,
 
     // Use 2D block (8x32 threads) as we need to compute column sum.
     // Use 1D grid to cover the data matrix width `cell_dim`.
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     //    dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
     //                 n_blocks(num_rows, dimBlock.y));
diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index be8483e48f5..26a5281ec05 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -2675,10 +2675,18 @@ static void UnitTestCuMatrixSetRandn() {
 template<typename Real>
 static void UnitTestCuMatrixSetRandUniform() {
+
+  //  if (CuDevice::Instantiate().Enabled()) {
+  //    CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456));
+  //  }
+
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200;
+    MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200;
     CuMatrix<Real> M(rows, cols);
     M.SetRandUniform();
+    // M.SetZero();
+    // M.Add(0.5);
+    // M.SetZeroAboveDiag();
 
     M.Add(-0.5);  // we'll be testing the central moments, so
                   // center it around zero first.
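The bounds this test checks come from the central moments of the uniform distribution: after Add(-0.5) the entries are U(-1/2, 1/2), whose odd central moments vanish and whose even ones are (1/2)^p / (p + 1), i.e. 1/12 for the variance and 1/80 for the fourth moment. A standalone check of those reference values (plain C++, not part of the test):

    #include <cmath>
    #include <cstdio>

    int main() {
      // E[X^p] for X ~ U(-1/2, 1/2): 0 for odd p, (1/2)^p / (p + 1) for even p.
      for (int p = 1; p <= 4; ++p) {
        double expected = (p % 2) ? 0.0 : std::pow(0.5, p) / (p + 1);
        std::printf("central moment %d: %.6f\n", p, expected);
      }
      return 0;  // prints 0, 0.083333, 0, 0.012500
    }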
@@ -2693,6 +2701,16 @@ static void UnitTestCuMatrixSetRandUniform() {
   for (int32 pow = 1; pow < central_moments.Dim(); pow++) {
     CuMatrix<Real> Mpow(M);
     Mpow.ApplyPow(pow);
+
+    // if (CuDevice::Instantiate().Enabled()) {
+    //   CuVector<Real> col_sum(rows, kUndefined);
+    //   cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim());
+    //   KALDI_LOG << "Sums vector is " << col_sum;
+    //   Real ans = col_sum.Sum();
+    //   KALDI_LOG << "Total sum is " << ans;
+    //   KALDI_ERR << "Stopping!";
+    // }
+
     Real observed_moment = Mpow.Sum() / (rows * cols);
     // see http://en.wikipedia.org/wiki/Normal_distribution#Moments,
     // note that mu = 0 and sigma = 1.
@@ -2705,10 +2723,12 @@ static void UnitTestCuMatrixSetRandUniform() {
         upper_bound = expected_moment + allowed_deviation;
     if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) {
       KALDI_LOG << "Random matrix is " << M;
+      //KALDI_LOG << "Random vector sum is " << col_sum;
       KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment
                 << ", expected " << expected_moment << ", allowed range "
                 << lower_bound << " to " << upper_bound;
     }
+    KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")";
   }
 }
@@ -3061,7 +3081,7 @@ template<typename Real> void CudaMatrixUnitTest() {
 int main() {
   SetVerboseLevel(1);
   int32 loop = 0;
-  bool test_threads = true;
+  bool test_threads = false;
   // num_threads only matters if test_threads == true.  Don't make it
   // to large, because it will affect CPU usage if you are using CPU.
   int32 num_threads = 4;
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 9897917a33f..56acf340823 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -253,7 +253,7 @@ void CuMatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &M,
   } else {
     // 2D thread block with warps (blockDim.x) along the row-dim of input M.
     // Each (8x32) thread block will transpose (32x32) data
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(M.NumCols(), warpSize),
                  n_blocks(M.NumRows(), warpSize));
@@ -859,7 +859,7 @@ void CuMatrixBase<Real>::DiffGroupPnorm(const CuMatrixBase<Real> &in_value,
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(NumCols(), dimBlock.x),
                  n_blocks(NumRows(), dimBlock.y));
@@ -1009,7 +1009,7 @@ void CuMatrixBase<Real>::AddSmat(Real alpha, const CuSparseMatrix<Real> &A,
     // We use warpSize threads per row to access only the nonzero elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of A.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y));
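Throughout these hunks the 2D block is warpSize x (CU1DBLOCK / warpSize): the total thread count per block stays fixed while the x-extent tracks the hardware warp. Assuming Kaldi's usual CU1DBLOCK of 256 (an assumption of this sketch, not stated in the patch), the shapes work out as:

    constexpr int kCu1dBlock = 256;  // stand-in for Kaldi's CU1DBLOCK
    static_assert(kCu1dBlock % 32 == 0 && kCu1dBlock % 64 == 0,
                  "block size must be a multiple of the warp/wavefront width");

    // dim3 dimBlock(warpSize, kCu1dBlock / warpSize) gives:
    //   NVIDIA (warpSize = 32): dim3(32, 8)  -- the 8x32 blocks the comments cite
    //   AMD    (warpSize = 64): dim3(64, 4)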
@@ -2186,7 +2186,7 @@ Real TraceMatMat(const CuMatrixBase<Real> &A,
     // if the matrix is not in a very bad shape.
     // (wider or taller than 32x8192)
     // CPU will then reduce to 1 element.
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize),
                  n_blocks(A.NumRows(), kWarpSize));
@@ -2408,7 +2408,7 @@ void CuMatrixBase<Real>::CopyColsFromVec(const CuVectorBase<Real> &rv) {
     // and use transposed copy to fill *this
     // see CuMatrixBase<Real>::CopyFromMat() for more detail of the impl
     MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ };
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(rv_dim.cols, warpSize),
                  n_blocks(rv_dim.rows, warpSize));
@@ -2418,7 +2418,7 @@ void CuMatrixBase<Real>::CopyColsFromVec(const CuVectorBase<Real> &rv) {
   } else if (rv.Dim() == num_rows_) {
     // use 2D block (8x32) and large enough grid to cover matrix *this
     // dimBlock.x need to be at least warpSize for coalesced memory access.
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(num_cols_, dimBlock.x),
                  n_blocks(num_rows_, dimBlock.y));
diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc
index 93d10099466..1a82ce0d4df 100644
--- a/src/cudamatrix/cu-sparse-matrix.cc
+++ b/src/cudamatrix/cu-sparse-matrix.cc
@@ -148,7 +148,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all selected rows.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y));
@@ -558,7 +558,7 @@ Real TraceMatSmat(const CuMatrixBase<Real> &A,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of B.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y));
@@ -648,7 +648,7 @@ void CuSparseMatrix<Real>::CopyToMat(CuMatrixBase<Real> *M,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(NumRows(), dimBlock.y));
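The "warpSize threads per row" comments repeated above describe a warp-per-row CSR traversal: the lanes of one warp stride over a row's nonzeros, and CU1DBLOCK/warpSize rows share each block. A minimal sketch of that access pattern (illustrative names; a plain atomic combine stands in for the warp reduction the real kernels use):

    __global__ void warp_per_row_sum(const int *row_ptr, const float *vals,
                                     float *row_sums, int num_rows) {
      // blockDim.x == warpSize; blockDim.y rows are serviced per block.
      int row = blockIdx.x * blockDim.y + threadIdx.y;
      if (row >= num_rows) return;
      float sum = 0.0f;
      // Adjacent lanes read adjacent nonzeros, so the loads coalesce.
      for (int i = row_ptr[row] + threadIdx.x; i < row_ptr[row + 1];
           i += blockDim.x)
        sum += vals[i];
      atomicAdd(&row_sums[row], sum);  // the real kernels warp-reduce instead
    }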
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 1deb1cb8733..f6426297e49 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -639,7 +639,10 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
                   N.Data(), N.Stride(), beta, data_);
     } else {
       // Case 2: diag(M'*N) == sum(M.*N, 1)
-      // 16x16 or 8x32 2D block for coalesced memory access.
+      // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2
+      // or
+      // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE
+      // 2D block for coalesced memory access.
       // Grid shape is designed as follows,
       // 1. for small matrices, use 1D grid with only 1 row of 16x16 block,
       //    to avoid multiple kernel launch;
@@ -647,11 +650,11 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
       //    use 1- or 2-D grid so that the grid contains
       //    at least and not much larger than 'kOptNumBlocks' blocks
       //    to fully utilize the GPU;
-      const int32 warpSize = 32;
+      const int32 warpSize = GPU_WARP_SIZE;
       const int32 kOptNumBlocks = 512;
       const int32 tile_dim =
           (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
-          16 : 32;
+          GPU_WARP_SIZE/2 : GPU_WARP_SIZE;
       dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
       dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
                    n_blocks(N.NumRows(), dimBlock.y));
@@ -678,7 +681,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // One block per 'tile_dim' columns of N.
     // 1D grid expands along the row of N.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
@@ -687,7 +690,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // Case 4: diag(M'*N') == sum(N'.*M, 1)
     // Same kernel and config as case 3 except M and N are swapped.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index 723b5b1f059..56d7e869a32 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -2,7 +2,19 @@
 #define __HIPIFY_H__
 
 #ifdef __HIPCC__
-inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
+inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
+  // On CDNA hardware, wave-fronts (warps) always execute in
+  // lock step. It might still be important to signal that the
+  // compiler can't reorder code around certain code sections
+  // that rely on data sharing mechanisms like LDS (shared
+  // memory). So this implements a no-op that the compiler
+  // sees as having side effects.
+  __asm__("s_nop 0");
+
+  // The safest option, arguably less performant, would be to use
+  // __asm__("s_waitcnt lgkmcnt(0)");
+  // to explicitly do a memory fence.
+}
 // AMDGCN only support this rounding mode.
 #define __fdiv_rd __fdiv_rn
 #else
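The s_nop above gives the optimizer an instruction with assumed side effects without paying for a real wait. A more conservative variant (a sketch of the alternative the comment alludes to, not what the patch ships) is an empty asm statement with a "memory" clobber, which is a pure compiler-level fence:

    // Emits no instruction, but the "memory" clobber forbids the compiler
    // from caching shared-memory values in registers or reordering the
    // surrounding loads/stores across this point. On CDNA the 64-lane
    // wavefront's lock-step execution supplies the actual synchronization.
    inline __device__ void syncwarp_compiler_fence(unsigned mask = 0xffffffffu) {
      (void)mask;  // lanes of a wavefront execute in lock step anyway
      __asm__ volatile("" ::: "memory");
    }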
@@ -153,7 +165,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaMallocHost hipHostMalloc
 #define cudaMallocPitch hipMallocPitch
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
+// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized
+// copies, which should be resolved in ROCm 5.7.1+. Then the following would
+// be sufficient:
+//   #define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \
+  [&]() -> hipError_t {                             \
+    if (width && height)                            \
+      return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \
+    return hipSuccess;                              \
+  }()
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -166,8 +187,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 #define cudaStreamCreate hipStreamCreate
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamLegacy ((hipStream_t)1)
-#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread ((hipStream_t)2)
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent hipStreamWaitEvent
@@ -243,6 +263,13 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define cub hipcub
 
+//
+// Callback qualifier
+//
+#define CUDART_CB
+#define GPU_WARP_SIZE 64
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
 
 #endif //__HIPIFY_H__
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk
index 8d85872aa9b..aec3e359f53 100644
--- a/src/makefiles/hip_64bit.mk
+++ b/src/makefiles/hip_64bit.mk
@@ -37,11 +37,14 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \
              -D__IS_HIP_COMPILE__=1 \
              -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \
              -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \
-             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics
+             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics \
+             $(EXTRA_ROCM_FLAGS)
+
 # TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles.
 # We allow the libraries we link against to have undefined symbols so as this can be build in
 # systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
 CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
-LDLIBS += -Wl,--allow-shlib-undefined
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

From ba4e18fcb2987b7172057aa5fc2613a9e1c1f2f8 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 17:14:29 +0000
Subject: [PATCH 42/76] Move misplaced #pragma unroll.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ac532790b86..349b21b6591 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2135,8 +2135,8 @@ static void _group_transform_reduce(
   __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
-#   pragma unroll
   int shift = threads_per_group / 2;
+#   pragma unroll
   for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);

From dac0b272cfff3fba9be4b3cfdd2767271e0d4760 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 23:46:48 +0000
Subject: [PATCH 43/76] Working version trimmed of legacy ROCm < 5.2 code.
--- .gitignore | 4 - src/chain/Makefile | 12 -- src/configure | 22 +-- src/cudadecoder/Makefile | 12 -- src/cudadecoder/cuda-decoder.cc | 2 +- src/cudafeat/Makefile | 12 -- .../feature-online-batched-ivector-cuda.cc | 38 ----- .../feature-online-batched-spectral-cuda.h | 4 - src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeat/feature-spectral-cuda.h | 4 - src/cudafeat/online-ivector-feature-cuda.cc | 20 +-- src/cudamatrix/Makefile | 12 -- src/cudamatrix/cu-allocator.cc | 4 - src/cudamatrix/cu-allocator.h | 4 - src/cudamatrix/cu-block-matrix.cc | 4 - src/cudamatrix/cu-common.h | 5 - src/cudamatrix/cu-compressed-matrix.cc | 4 - src/cudamatrix/cu-device.cc | 5 +- src/cudamatrix/cu-device.h | 9 -- src/cudamatrix/cu-kernels.cu | 33 +--- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-matrix-test.cc | 30 +--- src/cudamatrix/cu-matrix.cc | 4 - src/cudamatrix/cu-packed-matrix.cc | 4 - src/cudamatrix/cu-sp-matrix.cc | 4 - src/cudamatrix/cu-sparse-matrix.cc | 4 - src/cudamatrix/cu-tp-matrix.cc | 4 - src/cudamatrix/cu-vector.cc | 4 - src/hip/hipify.h | 12 -- src/hip/math_constants.h | 152 ------------------ src/makefiles/hip_64bit.mk | 3 + 31 files changed, 29 insertions(+), 414 deletions(-) delete mode 100644 src/hip/math_constants.h diff --git a/.gitignore b/.gitignore index 53a4079d9ef..9f8c727d4d0 100644 --- a/.gitignore +++ b/.gitignore @@ -90,7 +90,3 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* - -# Eclipse sync project -.ptp-sync -.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index 5b177981ad8..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,21 +33,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index 5f9c48a6cde..37a75a5cade 100755 --- a/src/configure +++ b/src/configure @@ -295,12 +295,11 @@ function configure_rocm { ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk - # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards use - # __HIP_PLATFORM_AMD__ others __HIP_PLATFORM_HCC__ - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk - else - echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; fi # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, @@ -309,17 +308,10 @@ function configure_rocm { cat makefiles/hip_64bit.mk >> kaldi.mk else echo "\ -WARNING: ROCM will not be used! 
- ROCM is only supported with 64-bit Linux builds." +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." exit 1; fi - - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk - else - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk - fi } diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index d4eda345564..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -41,21 +41,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 06dceae73a5..9baa274e2ea 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -199,7 +199,7 @@ void CudaDecoder::AllocateHostData() { (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c0f54a854e8..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -44,21 +44,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 68c247b43e9..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -382,43 +382,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; - -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work; - - // perform factorization in batched - 
CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, &L_work, num_lanes)); - // allocate temp buffer - float *workspace = static_cast( - CuDevice::Instantiate().Malloc(L_work * sizeof(float))); - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); - - int L_work2; - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - // solve for rhs in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, - num_lanes)); - - CuDevice::Instantiate().Free(workspace); - CuDevice::Instantiate().Free(workspace2); -#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -429,7 +392,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); -#endif #endif // cusolver solves in place. Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 202232c6b23..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index f8947a3b5ed..bb78028118f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -27,6 +27,7 @@ #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. #ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 66f0dce395a..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index 56dbac93165..fa0e9f68237 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -299,14 +299,13 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. 
CuMatrix A(quadratic); - - #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); + // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); @@ -317,26 +316,9 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work2; - CUSOLVER_SAFE_CALL( - hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - CUSOLVER_SAFE_CALL(hipsolverSpotrs( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); - - CuDevice::Instantiate().Free(workspace2); -#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); -#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 3c1100753e5..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,20 +34,8 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index d81dca002ce..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,11 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f776bbb620e..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 7983cd250e7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index c4bdf569d3c..41ef7536a7f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,13 +32,8 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || 
ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 442d2dbac67..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 25775fb1b05..4d0be20ddc3 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,13 +21,10 @@ // limitations under the License. + #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 67b9f1d9e9b..bb1170314c4 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,13 +29,8 @@ #include #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include @@ -49,11 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 349b21b6591..3d7fae5c15e 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,15 +27,18 @@ #include #include -#include #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 +#include #include #include "hipify.h" +#define CUDART_INF HIP_INF +#define CUDART_INF_F HIP_INF_F #include "cudamatrix/cu-kernels-ansi.h" #include #include #else +#include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION @@ -2048,27 +2051,9 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } - - // if (tid == 0) { - // for (int j = 0; j < d.cols; j += 1) - // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); - // result[i] = tdata; - - // } - // return; - sdata[tid] = tdata; __syncthreads(); - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < CU1DBLOCK; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2077,16 +2062,6 @@ static void _transform_reduce_mat_cols( __syncthreads(); } - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < 2*warpSize; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - - // Reduce last warp. Threads implicitly synchronized within a warp. for (int shift = warpSize; shift > 0; shift >>= 1) { if (tid < warpSize) { diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 1245fb28bad..e1d59e777be 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), //16 - cell_dim = RandInt(2, 200), //45 - dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); //3 + int32 num_rows = RandInt(5, 20), + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,6 +232,7 @@ void UnitTestLstmNonlinearity() { else test_params = -1; + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -276,11 +277,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { // -1 + if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { // 0 + if (test_params >= 0) { delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 26a5281ec05..ecddd24db19 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,19 +2675,11 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { - - // if (CuDevice::Instantiate().Enabled()) { - // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); - // } - for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; + MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - // M.SetZero(); - // M.Add(0.5); - // M.SetZeroAboveDiag(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. // Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2701,16 +2693,6 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); - - // if (CuDevice::Instantiate().Enabled()) { - // CuVector col_sum(rows, kUndefined); - // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); - // KALDI_LOG << "Sums vector is " << col_sum; - // Real ans = col_sum.Sum(); - // KALDI_LOG << "Total sum is " << ans; - // KALDI_ERR << "Stopping!"; - // } - Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2723,13 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - //KALDI_LOG << "Random vector sum is " << col_sum; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; - } + } } } @@ -3081,7 +3061,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = false; + bool test_threads = true; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. 
int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 56acf340823..fd31758f0e6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,11 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 4de0fcba63d..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,11 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 86a3cd9a726..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 35ba3ee0c81..cda575b1914 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 739bab3dd59..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index f6426297e49..c88b3ebf50c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 56d7e869a32..efe4848c009 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -211,7 +211,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t -#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t @@ -221,17 +220,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize #define cusolverDnSpotrs hipsolverDnSpotrs #define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#else -#define cusolverDnCreate hipsolverCreate -#define cusolverDnDestroy hipsolverDestroy -#define cusolverDnHandle_t hipsolverHandle_t -#define cusolverDnSetStream hipsolverSetStream -#define cusolverDnSpotrf hipsolverSpotrf -#define cusolverDnSpotrfBatched hipsolverSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize -#define cusolverDnSpotrs 
hipsolverSpotrs -#define cusolverDnSpotrsBatched hipsolverSpotrsBatched -#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h deleted file mode 100644 index 7fb8fce8e71..00000000000 --- a/src/hip/math_constants.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__MATH_CONSTANTS_H__) -#define __MATH_CONSTANTS_H__ - -/* single precision constants */ -#define CUDART_INF_F __int_as_float(0x7f800000) -#define CUDART_NAN_F __int_as_float(0x7fffffff) -#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) -#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) -#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) -#define CUDART_ZERO_F 0.0f -#define CUDART_ONE_F 1.0f -#define CUDART_SQRT_HALF_F 0.707106781f -#define CUDART_SQRT_HALF_HI_F 0.707106781f -#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f -#define CUDART_SQRT_TWO_F 1.414213562f -#define CUDART_THIRD_F 0.333333333f -#define CUDART_PIO4_F 0.785398163f -#define CUDART_PIO2_F 1.570796327f -#define CUDART_3PIO4_F 2.356194490f -#define CUDART_2_OVER_PI_F 0.636619772f -#define CUDART_SQRT_2_OVER_PI_F 0.797884561f -#define CUDART_PI_F 3.141592654f -#define CUDART_L2E_F 1.442695041f -#define CUDART_L2T_F 3.321928094f -#define CUDART_LG2_F 0.301029996f -#define CUDART_LGE_F 0.434294482f -#define CUDART_LN2_F 0.693147181f -#define CUDART_LNT_F 2.302585093f -#define CUDART_LNPI_F 1.144729886f -#define CUDART_TWO_TO_M126_F 1.175494351e-38f -#define CUDART_TWO_TO_126_F 8.507059173e37f -#define CUDART_NORM_HUGE_F 3.402823466e38f -#define CUDART_TWO_TO_23_F 8388608.0f -#define CUDART_TWO_TO_24_F 16777216.0f -#define CUDART_TWO_TO_31_F 2147483648.0f -#define CUDART_TWO_TO_32_F 4294967296.0f -#define CUDART_REMQUO_BITS_F 3 -#define CUDART_REMQUO_MASK_F (~((~0)< Date: Mon, 6 Nov 2023 23:59:05 +0000 Subject: [PATCH 44/76] Fix formating to Google style. --- .../batched-static-nnet3-kernels.h | 1 + ...hed-threaded-nnet3-cuda-online-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline2.cc | 1 + src/cudadecoder/cuda-decoder-kernels.cu | 3 +- src/cudadecoder/cuda-decoder.cc | 22 +- src/cudadecoder/cuda-decoder.h | 1 + src/cudadecoder/cuda-fst.cc | 1 + .../batched-wav-nnet3-cuda-online.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 1 + ...eature-online-batched-cmvn-cuda-kernels.cu | 1 + ...ure-online-batched-ivector-cuda-kernels.cu | 27 +- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- .../feature-online-batched-spectral-cuda.h | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 2 + src/cudafeat/feature-spectral-cuda.h | 1 + src/cudafeat/feature-window-cuda.cu | 1 + .../online-batched-feature-pipeline-cuda.cc | 4 +- .../online-ivector-feature-cuda-kernels.cu | 28 +- src/cudafeat/online-ivector-feature-cuda.cc | 1 + src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 3 +- src/cudamatrix/cu-array-inl.h | 1 + src/cudamatrix/cu-array.cc | 1 + src/cudamatrix/cu-block-matrix.cc | 1 + src/cudamatrix/cu-common.cc | 121 +++-- src/cudamatrix/cu-common.h | 5 +- src/cudamatrix/cu-compressed-matrix.cc | 1 + src/cudamatrix/cu-device.cc | 11 +- src/cudamatrix/cu-device.h | 5 +- src/cudamatrix/cu-kernels.cu | 107 ++-- src/cudamatrix/cu-matrix-test.cc | 6 +- src/cudamatrix/cu-matrix.cc | 1 + src/cudamatrix/cu-packed-matrix.cc | 1 + src/cudamatrix/cu-sp-matrix.cc | 1 + src/cudamatrix/cu-sparse-matrix.cc | 1 + src/cudamatrix/cu-tp-matrix.cc | 1 + src/cudamatrix/cu-vector.cc | 16 +- src/cudamatrix/cublas-wrappers.h | 13 +- src/hip/hipify.h | 488 +++++++++--------- 42 files changed, 512 insertions(+), 384 deletions(-) diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 
0bcb1997576..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index c7012b686e0..ed0c0a2f5e9 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -23,6 +23,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index d5cf7dae2d7..23d0ca283a2 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -28,6 +28,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index f6a3455db01..01d6b1165e7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 6a14371911d..8503182c1f8 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -16,8 +16,9 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ -#include "float.h" #include + +#include "float.h" #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 9baa274e2ea..056d563a791 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -40,6 +40,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -190,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_tmp_, + (void 
**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index 510904aa004..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 3af37eb7676..682485f6ce4 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 56368853df2..2bc0a483a0f 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -25,8 +25,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 05af50d7a3b..0e4a719bc75 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -19,8 +19,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index c14571f2ed9..b2ad9254c67 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -22,6 +22,7 @@ #include #include #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index 7a521d43693..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -17,6 +17,7 @@ // #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index e5b89d163e5..da2ba24bd90 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -102,8 +103,9 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, 
GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, num_lanes); zero_invalid_posteriors_kernel<<>>( @@ -217,8 +219,10 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -314,8 +318,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // is less than stash size KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); - // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); @@ -325,9 +329,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -510,7 +516,8 @@ void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index 27375f4914e..856d2acab81 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -18,8 +18,10 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" #ifdef __IS_HIP_COMPILE__ -#include #include + +#include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..d18f5237e8f 100644 --- 
a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index bb78028118f..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -18,6 +18,7 @@ #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 #include + #include "hipify.h" #else #include @@ -189,7 +190,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 9c0d5df5288..d8fc215b80b 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -19,7 +19,9 @@ #ifdef __IS_HIP_COMPILE__ #include + #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index 6ba45e682c1..60fe113d402 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 650b51ec3c7..7736f525237 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -100,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index dffc9fd3c8f..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -17,6 +17,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -34,9 +35,12 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps - __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - 
__shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,7 +145,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; @@ -207,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index fa0e9f68237..f96b2a81ce2 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" // The BLAS enumerators are used instead of the SOLVER ones. 
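// A short sketch for clarity (an assumption; the guarded block itself lies
// outside this hunk): the cuSOLVER entry points used here, e.g.
// cusolverDnSpotrf_bufferSize, take a cublasFillMode_t argument, and
// hipify.h maps those BLAS enumerators to hipBLAS equivalents, so a call
// such as
//   cusolverDnSpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, A, lda,
//                               &lwork);
// presumably compiles against hipSOLVER/hipBLAS unchanged under
// __IS_HIP_COMPILE__.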
#ifdef CUBLAS_FILL_MODE_LOWER diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,6 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -33,7 +34,6 @@ #include #endif - #include #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,9 +24,10 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 1fd80502cf9..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,6 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 333e8fbed1c..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,6 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 2e77062f20d..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #define API_NAME_PREFIX "HIP" #else @@ -59,7 +60,7 @@ NvtxTracer::~NvtxTracer() { #ifdef __IS_HIP_COMPILE__ roctxRangePop(); #else - nvtxRangePop(); + nvtxRangePop(); #endif } #endif @@ -102,19 +103,31 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + 
case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; #endif } return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; @@ -124,20 +137,32 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } return 
API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } @@ -146,21 +171,35 @@ const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; #endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 41ef7536a7f..934668da6f2 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,11 +32,12 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include -#include #include +#include #include +#include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,6 
+22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 4d0be20ddc3..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,15 +24,16 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include #include #include -#endif // __IS_HIP_COMPILE__ +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -246,12 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode - #ifdef __IS_HIP_COMPILE__ +#ifdef __IS_HIP_COMPILE__ KALDI_WARN << "Not in compute-exclusive mode."; - #else +#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; - #endif +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index bb1170314c4..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,12 @@ #include #ifdef __IS_HIP_COMPILE__ -#include -#include #include #include +#include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 3d7fae5c15e..8d5784acb52 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -31,18 +31,18 @@ #define __CUDA_ARCH__ 800 #include #include + #include "hipify.h" -#define CUDART_INF HIP_INF -#define CUDART_INF_F HIP_INF_F -#include "cudamatrix/cu-kernels-ansi.h" -#include #include +#include + +#include "cudamatrix/cu-kernels-ansi.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION -#endif //__IS_HIP_COMPILE__ +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -1122,7 +1122,7 @@ void trace_mat_mat_trans_atomic(Real *d_result, // Assuming *d_result is set to zero already constexpr int THREADS_X = GPU_WARP_SIZE; - constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -2111,7 +2111,7 @@ static void _group_transform_reduce( // tree-reduce to 2x warpSize elements per group int shift = threads_per_group / 2; -# pragma unroll +#pragma unroll for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); @@ -4009,9 +4009,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4390,7 +4390,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, 
B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4413,8 +4413,8 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -4426,10 +4426,10 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5105,7 +5105,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5128,8 +5128,8 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -5141,10 +5141,10 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5516,25 +5516,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5831,8 +5831,9 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // this will synchronize all threads without blocking. 
void cuda_legacy_noop() { #ifdef __IS_HIP_COMPILE__ - // HIP doesn't currently support cudaStreamLegacy stream so we force the implementation to use the - // legacy (not per-thread) API to get similar semantics. + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. auto k = reinterpret_cast(_noop_kernel); hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); #else @@ -5847,8 +5848,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5861,8 +5864,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5871,8 +5876,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5898,9 +5902,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5920,10 +5925,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5936,8 +5942,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5963,9 +5968,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5986,10 +5992,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index ecddd24db19..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2679,7 +2679,7 @@ static void UnitTestCuMatrixSetRandUniform() { MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
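// (Worked check: after the Add(-0.5) the entries are uniform on
// [-1/2, 1/2]; the odd central moments of this distribution vanish and the
// even ones are E[x^p] = (1/2)^p / (p + 1), e.g. 1/12 for p = 2 and 1/80
// for p = 4.)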
// Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2705,11 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - } + } } } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index fd31758f0e6..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,6 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index cda575b1914..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index c88b3ebf50c..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -649,8 +650,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - GPU_WARP_SIZE/2 : GPU_WARP_SIZE; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -676,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? 
GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index dc5c0e0ced5..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -37,8 +37,9 @@ inline cublasStatus_t cublas_gemm( const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, - C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -66,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, - (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif diff --git a/src/hip/hipify.h b/src/hip/hipify.h index efe4848c009..e9ca483d022 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -2,250 +2,262 @@ #define __HIPIFY_H__ #ifdef __HIPCC__ -inline __device__ void __syncwarp(unsigned mask=0xffffffff) { - // On CDNA hardware wave-fronts (warps) execute always in - // lock step. Though it might still be important to signal - // that the compiler can't reorder code around certain code - // sections that rely on data sharing mecanisms like LDS - // (shared memory). So this implements a No-op but is seen - // by the compiler as having side effects. - __asm__("s_nop 0"); +inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { + // On CDNA hardware wave-fronts (warps) always execute in + // lock step. Though it might still be important to signal + // that the compiler can't reorder code around certain code + // sections that rely on data-sharing mechanisms like LDS + // (shared memory). 
So this implements a no-op but is seen + // by the compiler as having side effects. + __asm__("s_nop 0"); - // A saffest option, arguably less performant would be to use: - // __asm__("s_waitcnt lgkmcnt(0)"); - // to explicitly do a memory fence. + // The safest, though arguably less performant, option would be to use: + // __asm__("s_waitcnt lgkmcnt(0)"); + // to explicitly do a memory fence. } // AMDGCN only support this rounding mode. #define __fdiv_rd __fdiv_rn #else -#define __align__(x) __attribute__((aligned (x))) +#define __align__(x) __attribute__((aligned(x))) #endif // // HIP types // -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. -#define CUBLAS_OP_C HIPBLAS_OP_C -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_R_32F HIPBLAS_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUDA_R_32F HIP_R_32F -#define CUDA_R_64F HIP_R_64F -#define CUFFT_R2C HIPFFT_R2C -#define CUFFT_SUCCESS HIPFFT_SUCCESS -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define CUSPARSE_INDEX_BASE_ZERO 
HIPSPARSE_INDEX_BASE_ZERO -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define cuDeviceGetName hipDeviceGetName -#define cuMemGetInfo_v2 hipMemGetInfo -#define cublasComputeType_t hipblasDatatype_t -#define cublasCreate hipblasCreate -#define cublasDasum_v2 hipblasDasum -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasDcopy_v2 hipblasDcopy -#define cublasDdot_v2 hipblasDdot -#define cublasDestroy hipblasDestroy -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasDgemm_v2 hipblasDgemm -#define cublasDgemv_v2 hipblasDgemv -#define cublasDger_v2 hipblasDger -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasDscal_v2 hipblasDscal -#define cublasDspmv_v2 hipblasDspmv -#define cublasDspr_v2 hipblasDspr -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmEx hipblasGemmEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasOperation_t hipblasOperation_t -#define cublasSasum_v2 hipblasSasum -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasScopy_v2 hipblasScopy -#define cublasSdot_v2 hipblasSdot -#define cublasSetStream hipblasSetStream -#define cublasSgemv_v2 hipblasSgemv -#define cublasSger_v2 hipblasSger -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasSscal_v2 hipblasSscal -#define cublasSspmv_v2 hipblasSspmv -#define cublasSspr_v2 hipblasSspr -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasStatus_t hipblasStatus_t -#define cublasStatus_t hipblasStatus_t -#define cublasStpmv_v2 hipblasStpmv -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaDataType hipDataType -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceReset hipDeviceReset -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cudaError_t hipError_t -#define cudaEventCreate hipEventCreate 
-#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDestroy hipEventDestroy -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorName hipGetErrorName -#define cudaGetErrorString hipGetErrorString -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterDefault hipHostRegisterDefault -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost hipHostMalloc -#define cudaMallocPitch hipMallocPitch -#define cudaMemcpy hipMemcpy -// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP \ + HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements + // for explicit tensor GEMMs. 
+#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED \ + HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES \ + HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED \ + HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName +#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate 
hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasDtrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasStrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy +// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized // copies, which should be canceled by ROCm 5.7.1+. 
Then the following would // be sufficient: // #define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \ - [&]() -> hipError_t { \ - if (width && height) \ - return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \ - return hipSuccess; \ - }() -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemGetInfo hipMemGetInfo -#define cudaMemset2DAsync hipMemset2DAsync -#define cudaMemsetAsync hipMemsetAsync -#define cudaProfilerStop hipProfilerStop -#define cudaSetDevice hipSetDevice -#define cudaStreamCreate hipStreamCreate -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread ((hipStream_t)2) -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define cufftComplex hipfftComplex -#define cufftDestroy hipfftDestroy -#define cufftExecR2C hipfftExecR2C -#define cufftHandle hipfftHandle -#define cufftPlanMany hipfftPlanMany -#define cufftSetStream hipfftSetStream -#define curandCreateGenerator hiprandCreateGenerator -#define curandDestroyGenerator hiprandDestroyGenerator -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerator_t hiprandGenerator_t -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
-#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetStream hiprandSetStream -#define curandStatus_t hiprandStatus_t -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cusolverDnSetStream hipsolverDnSetStream -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#define cusparseAction_t hipsparseAction_t -#define cusparseCreate hipsparseCreate -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseCreateDnMat hipsparseCreateDnMat -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseDestroy hipsparseDestroy -#define cusparseDestroy hipsparseDestroy -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define cusparseHandle_t hipsparseHandle_t -#define cusparseIndexBase_t hipsparseIndexBase_t -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cusparseOperation_t hipsparseOperation_t -#define cusparseScsr2csc hipsparseScsr2csc -#define cusparseSetStream hipsparseSetStream -#define cusparseSpMM hipsparseSpMM -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseStatus_t hipsparseStatus_t -#define nvtxRangePop roctxRangePop -#define nvtxRangePush roctxRangePush -#define nvtxRangePushA roctxRangePushA +#define cudaMemcpy2DAsync(a, b, c, d, width, height, e, f) \ + [&]() -> hipError_t { \ + if (width && height) \ + return hipMemcpy2DAsync(a, b, c, d, width, height, e, f); \ + return hipSuccess; \ + }() +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define 
curandSetGeneratorOrdering(x, y) \
+  0  // HIP does not support generator ordering.
+#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
+#define curandSetStream hiprandSetStream
+#define curandStatus_t hiprandStatus_t
+#define cusolverDnCreate hipsolverDnCreate
+#define cusolverDnDestroy hipsolverDnDestroy
+#define cusolverDnHandle_t hipsolverDnHandle_t
+#define cusolverDnSetStream hipsolverDnSetStream
+#define cusolverDnSpotrf hipsolverDnSpotrf
+#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
+#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
+#define cusolverDnSpotrs hipsolverDnSpotrs
+#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
+#define cusparseAction_t hipsparseAction_t
+#define cusparseCreate hipsparseCreate
+#define cusparseCreateCsr hipsparseCreateCsr
+#define cusparseCreateDnMat hipsparseCreateDnMat
+#define cusparseCreateMatDescr hipsparseCreateMatDescr
+#define cusparseDcsr2csc hipsparseDcsr2csc
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroyDnMat hipsparseDestroyDnMat
+#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
+#define cusparseDestroySpMat hipsparseDestroySpMat
+#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
+#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
+#define cusparseHandle_t hipsparseHandle_t
+#define cusparseIndexBase_t hipsparseIndexBase_t
+#define cusparseMatDescr_t hipsparseMatDescr_t
+#define cusparseOperation_t hipsparseOperation_t
+#define cusparseScsr2csc hipsparseScsr2csc
+#define cusparseSetStream hipsparseSetStream
+#define cusparseSpMM hipsparseSpMM
+#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
+#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
+#define cusparseStatus_t hipsparseStatus_t
+#define nvtxRangePop roctxRangePop
+#define nvtxRangePush roctxRangePush
+#define nvtxRangePushA roctxRangePushA
 //
 // HIPCUB namespace.
 //
@@ -256,8 +268,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define CUDART_CB
+//
+// Math constants
+//
+#define CUDART_INF HIP_INF
+#define CUDART_INF_F HIP_INF_F
+
+//
+// GPU static hardware characteristics.
+//
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024
-#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
-#endif //__HIPIFY_H__
-
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
+#endif //__HIPIFY_H__

From 3aaa32637850c919af905b1c799b3f4919d804cd Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:00:01 +0000
Subject: [PATCH 45/76] Fix more formatting to Google style.
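Besides re-wrapping long comment lines to the 80-column limit, this moves
the "hipify.h" include in cu-kernels.cu below "cudamatrix/cu-kernels-ansi.h"
so that the project-header group stays alphabetically sorted, as the Google
C++ style guide asks. Roughly (a sketch of just these two headers; the
neighbouring system includes are elided in the diff below):

    // Project-local headers form the last include group and are sorted
    // alphabetically within it:
    #include "cudamatrix/cu-kernels-ansi.h"
    #include "hipify.h"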
--- src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu | 3 ++- src/cudamatrix/cu-kernels.cu | 2 +- src/hip/hipify.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index da2ba24bd90..5b94c34e829 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -222,7 +222,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size if (threads > GPU_MAX_THREADS_PER_BLOCK) - threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8d5784acb52..9127819eca5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -32,11 +32,11 @@ #include #include -#include "hipify.h" #include #include #include "cudamatrix/cu-kernels-ansi.h" +#include "hipify.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index e9ca483d022..459372e68b8 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -275,7 +275,7 @@ inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { #define CUDART_INF_F HIP_INF_F // -// GPU static hardware characteristics. +// GPU static hardware characteristics. // #define GPU_WARP_SIZE 64 #define GPU_MAX_THREADS_PER_BLOCK 1024 From 6ebab7023b01a4270cbd07b5c3bfce7f1ca2c461 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 7 Nov 2023 00:25:49 +0000 Subject: [PATCH 46/76] Fix header ordering. --- src/cudamatrix/cu-kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9127819eca5..9df6cea6e9d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -32,8 +32,8 @@ #include #include -#include #include +#include #include "cudamatrix/cu-kernels-ansi.h" #include "hipify.h" From 7efdeaeb10ed0ae2593ee69faa04b5172a39aba9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 7 Nov 2023 05:16:09 -0600 Subject: [PATCH 47/76] Add GPU characteristics for CUDA. 
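These mirror the values that hipify.h hard-codes for the ROCm build (where
the warp size is 64), so kernel code shared by both builds can derive its
launch configuration from one set of macros instead of vendor-specific
literals. A minimal sketch of the usage pattern (the helper name is
hypothetical, not code from this patch):

    // Round a per-row work size up to a whole number of warps, capped at
    // the per-block thread limit; GPU_WARP_SIZE is 32 on CUDA, 64 on ROCm.
    static inline int RoundUpToWarps(int dim) {
      int threads = (dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
      if (threads > GPU_MAX_THREADS_PER_BLOCK)
        threads = GPU_MAX_THREADS_PER_BLOCK;
      return threads;
    }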
---
 src/cudamatrix/cu-common.h   | 4 ++++
 src/cudamatrix/cu-kernels.cu | 1 +
 2 files changed, 5 insertions(+)

diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 934668da6f2..3206fe7e7f4 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -45,6 +45,10 @@
 #include
 #include
 #include
+
+#define GPU_WARP_SIZE 32
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
 #endif

 #define CU_SAFE_CALL(fun) \
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9df6cea6e9d..b3c3165bd96 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -39,6 +39,7 @@
 #include "hipify.h"
 #else
 #include
+#include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-kernels-ansi.h"
 #include
 #include // for CUDA_VERSION

From 700bf93631b8c999f0421fffd74b4a29eb2685a3 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 9 Nov 2023 11:49:12 +0100
Subject: [PATCH 48/76] [tools] Replace uses of Python distutils

The `distutils` package has been removed in Python 3.12:
https://docs.python.org/3.11/distutils/index.html

The `sysconfig` package is available since Python 3.2 and provides the
necessary replacement functionality:
https://docs.python.org/3/library/sysconfig.html
---
 tools/extras/install_cffi.sh          | 2 +-
 tools/extras/install_mmseg.sh         | 8 ++++----
 tools/extras/install_phonetisaurus.sh | 8 ++++----
 tools/extras/install_sequitur.sh      | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tools/extras/install_cffi.sh b/tools/extras/install_cffi.sh
index dc7f91724a7..5ac9904173e 100755
--- a/tools/extras/install_cffi.sh
+++ b/tools/extras/install_cffi.sh
@@ -35,7 +35,7 @@ echo "**** Installing Cffi and dependencies"

 echo "Checking for Python-Dev"
 # copied from https://stackoverflow.com/questions/4848566/check-for-existence-of-python-dev-files-from-bash-script
-if [ ! -e $(python -c 'from distutils.sysconfig import get_makefile_filename as m; print m()') ]; then
+if [ ! -e $(python -c 'from sysconfig import get_makefile_filename as m; print(m())') ]; then
   echo "On Debian/Ubuntu like system install by 'sudo apt-get python-dev' package."
   echo "On Fedora by 'yum install python-devel'"
   echo "On Mac OS X by 'brew install python'"
diff --git a/tools/extras/install_mmseg.sh b/tools/extras/install_mmseg.sh
index a76b98e2061..e6e17716718 100755
--- a/tools/extras/install_mmseg.sh
+++ b/tools/extras/install_mmseg.sh
@@ -16,13 +16,13 @@ fi

 # Install python-devel package if not already available
-# first, makes sure distutils.sysconfig usable
-if ! $(python -c "import distutils.sysconfig" &> /dev/null); then
-  echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2
+# first, makes sure sysconfig is usable
+if ! $(python -c "import sysconfig" &> /dev/null); then
+  echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2
   echo "Proceeding with installation." >&2
 else
   # get include path for this python version
-  INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())")
+  INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))")
   if [ !
-f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then diff --git a/tools/extras/install_phonetisaurus.sh b/tools/extras/install_phonetisaurus.sh index 8a07c5f5ca5..e407978972f 100755 --- a/tools/extras/install_phonetisaurus.sh +++ b/tools/extras/install_phonetisaurus.sh @@ -15,16 +15,16 @@ fi echo "You must call this script from the tools/ directory" && exit 1; # Install python-devel package if not already available -# first, makes sure distutils.sysconfig usable +# first, makes sure sysconfig is usable # We are not currently compiling the bindings by default, but it seems # worth it to keep this section as we do have them and they will # probably be used. -if ! $(python -c "import distutils.sysconfig" &> /dev/null); then - echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 +if ! $(python -c "import sysconfig" &> /dev/null); then + echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 echo "Proceeding with installation." >&2 else # get include path for this python version - INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())") + INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))") if [ ! -f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index b70e6cbb447..62b27e451ac 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -15,13 +15,13 @@ fi echo "You must call this script from the tools/ directory" && exit 1; # Install python-devel package if not already available -# first, makes sure distutils.sysconfig usable -if ! $(python -c "import distutils.sysconfig" &> /dev/null); then - echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 +# first, makes sure sysconfig is usable +if ! $(python -c "import sysconfig" &> /dev/null); then + echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 echo "Proceeding with installation." >&2 else # get include path for this python version - INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())") + INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))") if [ ! -f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then From cd2b8354b7d2e3a734b8d87d44c566cb4d8f2d0e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 10 Nov 2023 16:39:48 +0800 Subject: [PATCH 49/76] Fix #4870, spurious error in ProcessNonemitting; queue can validly be empty. 
# Conflicts: # src/cudamatrix/cu-kernels.cu --- src/decoder/lattice-simple-decoder.cc | 70 ++++++++++++--------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc index cc8712e854d..87378f93bbd 100644 --- a/src/decoder/lattice-simple-decoder.cc +++ b/src/decoder/lattice-simple-decoder.cc @@ -45,8 +45,8 @@ void LatticeSimpleDecoder::InitDecoding() { bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { InitDecoding(); - - while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { + + while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); ProcessEmitting(decodable); @@ -57,7 +57,7 @@ bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { ProcessNonemitting(); } FinalizeDecoding(); - + // Returns true if we have any kind of traceback available (not necessarily // to the end state; query ReachedFinal() for that). return !final_costs_.empty(); @@ -88,9 +88,9 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, if (decoding_finalized_ && !use_final_probs) KALDI_ERR << "You cannot call FinalizeDecoding() and then call " << "GetRawLattice() with use_final_probs == false"; - + unordered_map final_costs_local; - + const unordered_map &final_costs = (decoding_finalized_ ? final_costs_ : final_costs_local); @@ -100,7 +100,7 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, ofst->DeleteStates(); int32 num_frames = NumFramesDecoded(); KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_/2 + 3; + const int32 bucket_count = num_toks_/2 + 3; unordered_map tok_map(bucket_count); // First create all states. for (int32 f = 0; f <= num_frames; f++) { @@ -169,10 +169,10 @@ bool LatticeSimpleDecoder::GetLattice( fst::ILabelCompare ilabel_comp; ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes // lattice-determinization more efficient. - + fst::DeterminizeLatticePrunedOptions lat_opts; lat_opts.max_mem = config_.det_opts.max_mem; - + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. Connect(ofst); // Remove unreachable states... there might be @@ -196,7 +196,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( bool emitting, bool *changed) { KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - + unordered_map::iterator find_iter = cur_toks_.find(state); if (find_iter == cur_toks_.end()) { // no such token presently. // Create one. @@ -221,7 +221,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( return tok; } } - + // delta is the amount by which the extra_costs must // change before it sets "extra_costs_changed" to true. If delta is larger, // we'll tend to go back less far toward the beginning of the file. @@ -242,7 +242,7 @@ void LatticeSimpleDecoder::PruneForwardLinks( warned_ = true; } } - + bool changed = true; while (changed) { changed = false; @@ -300,7 +300,7 @@ void LatticeSimpleDecoder::ComputeFinalCosts( BaseFloat infinity = std::numeric_limits::infinity(); BaseFloat best_cost = infinity, best_cost_with_final = infinity; - + for (unordered_map::const_iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { StateId state = iter->first; @@ -336,19 +336,19 @@ void LatticeSimpleDecoder::ComputeFinalCosts( // on the final frame. 
If there are final tokens active, it uses the final-probs // for pruning, otherwise it treats all tokens as final. void LatticeSimpleDecoder::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); + KALDI_ASSERT(!active_toks_.empty()); int32 frame_plus_one = active_toks_.size() - 1; if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. KALDI_WARN << "No tokens alive at end of file\n"; - typedef unordered_map::const_iterator IterType; + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; // We're about to delete some of the tokens active on the final frame, so we // clear cur_toks_ because otherwise it would then contain dangling pointers. cur_toks_.clear(); - + // Now go through tokens on this frame, pruning forward links... may have to // iterate a few times until there is no more change, because the list is not // in topological order. This is a modified version of the code in @@ -429,7 +429,7 @@ BaseFloat LatticeSimpleDecoder::FinalRelativeCost() const { return final_relative_cost_; } } - + // Prune away any tokens on this frame that have no forward links. [we don't do // this in PruneForwardLinks because it would give us a problem with dangling // pointers]. @@ -453,14 +453,14 @@ void LatticeSimpleDecoder::PruneTokensForFrame(int32 frame) { } } } - + // Go backwards through still-alive tokens, pruning them, starting not from // the current frame (where we want to keep all tokens) but from the frame before // that. We go backwards through the frames and stop when we reach a point // where the delta-costs are not changing (and the delta controls when we consider // a cost to have "not changed"). void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); + int32 cur_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; // The index "f" below represents a "frame plus one", i.e. you'd have to subtract // one to get the corresponding index for the decodable object. @@ -468,7 +468,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // Reason why we need to prune forward links in this situation: // (1) we have never pruned them // (2) we never pruned the forward links on the next frame, which - // + // if (active_toks_[f].must_prune_forward_links) { bool extra_costs_changed = false, links_pruned = false; PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); @@ -478,7 +478,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { active_toks_[f].must_prune_tokens = true; active_toks_[f].must_prune_forward_links = false; } - if (f+1 < cur_frame_plus_one && + if (f+1 < cur_frame_plus_one && active_toks_[f+1].must_prune_tokens) { PruneTokensForFrame(f+1); active_toks_[f+1].must_prune_tokens = false; @@ -493,20 +493,20 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // (optionally) on the final frame. Takes into account the final-prob of // tokens. This function used to be called PruneActiveTokensFinal(). void LatticeSimpleDecoder::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); + int32 final_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { + for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { bool b1, b2; // values not used. 
BaseFloat dontcare = 0.0; PruneForwardLinks(f, &b1, &b2, dontcare); PruneTokensForFrame(f + 1); } - PruneTokensForFrame(0); + PruneTokensForFrame(0); KALDI_VLOG(3) << "pruned tokens from " << num_toks_begin << " to " << num_toks_; } - + void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { int32 frame = active_toks_.size() - 1; // frame is the frame-index // (zero-based) used to get likelihoods @@ -538,9 +538,9 @@ void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { // AddToken adds the next_tok to cur_toks_ (if not already present). Token *next_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, true, NULL); - + // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } @@ -553,7 +553,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() { // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). - + // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. Note-- this queue structure is is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), @@ -569,15 +569,9 @@ void LatticeSimpleDecoder::ProcessNonemitting() { queue.push_back(state); best_cost = std::min(best_cost, iter->second->tot_cost); } - if (queue.empty()) { - if (!warned_) { - KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is " - << frame; - warned_ = true; - } - } + BaseFloat cutoff = best_cost + config_.beam; - + while (!queue.empty()) { StateId state = queue.back(); queue.pop_back(); @@ -600,10 +594,10 @@ void LatticeSimpleDecoder::ProcessNonemitting() { bool changed; Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, false, &changed); - + tok->links = new ForwardLink(new_tok, 0, arc.olabel, graph_cost, 0, tok->links); - + // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. 
if (changed && fst_.NumInputEpsilons(arc.nextstate) != 0) @@ -662,5 +656,3 @@ void LatticeSimpleDecoder::PruneCurrentTokens(BaseFloat beam, unordered_map Date: Mon, 13 Nov 2023 11:54:21 +0800 Subject: [PATCH 50/76] Update fix_data_dir.sh --- egs/wsj/s5/utils/fix_data_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index ed4710d0b1f..051715f2b1e 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -54,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x From b7e886c13a075bd73a09d515f54fb6a159eb1472 Mon Sep 17 00:00:00 2001 From: Omer Danziger <57575138+Omerdan03@users.noreply.github.com> Date: Thu, 14 Dec 2023 10:18:15 +0200 Subject: [PATCH 51/76] Update COPYING The list of Individual Contributors wasn't really in alphabetical order --- COPYING | 106 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/COPYING b/COPYING index 5a5cab00a29..2b0dbd4243a 100644 --- a/COPYING +++ b/COPYING @@ -57,72 +57,72 @@ License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - Mohit Agarwal - Tanel Alumae - Gilles Boulianne - Lukas Burget - Dogan Can - Guoguo Chen - Gaofeng Cheng + Albert Vernon + Alexander Solovets + Allen Guo + Ariya Rastrow + Arnab Ghoshal Cisco Corporation - Pavel Denisov - Ilya Edrenkin - Ewald Enzinger - Joachim Fainberg Daniel Galvez - Pegah Ghahremani - Arnab Ghoshal - Ondrej Glembek + Daniel Povey + Danijel Korzinek + David Snyder + Dogan Can + Eduardo Silva + Ewald Enzinger + Gaofeng Cheng + Gaurav Kumar + Georg Stemmer + Gilles Boulianne Go Vivace Inc. - Allen Guo - Hossein Hadian - Lv Hang - Mirko Hannemann + Guoguo Chen + Haihua Xu + Hainan Xu Hendy Irawan - Navdeep Jaitly + Hossein Hadian + Ilya Edrenkin + Jan "Yenda" Trmal + Jan Silovsky + Joachim Fainberg Johns Hopkins University - Shiyin Kang - Kirill Katsnelson - Tom Ko - Danijel Korzinek - Gaurav Kumar + Karel Vesely Ke Li + Kirill Katsnelson + Lucas Ondel + Lukas Burget + Lv Hang Matthew Maciejewski - Vimal Manohar - Yajie Miao Microsoft Corporation + Minhua Wu + Mirko Hannemann + Mohit Agarwal + Navdeep Jaitly + Nickolay V. Shmyrev + Omid Sadjadi + Ondrej Glembek + Ondrej Platek + Pavel Denisov + Pawel Swietojanski + Pegah Ghahremani + Peter Smit Petr Motlicek - Xingyu Na - Vincent Nguyen - Lucas Ondel - Vassil Panayotov - Vijayaditya Peddinti + Petr Schwarz Phonexia s.r.o. - Ondrej Platek - Daniel Povey - Yanmin Qian - Ariya Rastrow Saarland University - Omid Sadjadi - Petr Schwarz - Yiwen Shao - Nickolay V. 
Shmyrev - Jan Silovsky - Eduardo Silva - Peter Smit - David Snyder - Alexander Solovets - Georg Stemmer - Pawel Swietojanski - Jan "Yenda" Trmal - Albert Vernon - Karel Vesely - Yiming Wang Shinji Watanabe - Minhua Wu - Haihua Xu - Hainan Xu + Shiyin Kang + Tanel Alumae + Tom Ko + Vassil Panayotov + Vijayaditya Peddinti + Vimal Manohar + Vincent Nguyen Xiaohui Zhang + Xingyu Na + Yajie Miao + Yanmin Qian + Yiming Wang + Yiwen Shao Other Source Material From 77ffb5556c825adcf22138d387967c627d54c415 Mon Sep 17 00:00:00 2001 From: Ilia Dzenzeliuk <43926347+dzen03@users.noreply.github.com> Date: Sun, 21 Jan 2024 18:08:34 +0300 Subject: [PATCH 52/76] Fix unused but set variable --- src/base/kaldi-error.h | 8 ++++---- src/bin/matrix-sum.cc | 6 ++---- src/bin/vector-sum.cc | 3 +-- src/chainbin/nnet3-chain-copy-egs.cc | 4 +--- src/fstext/pre-determinize-inl.h | 3 --- src/gmm/mle-diag-gmm-test.cc | 7 ++----- src/gmm/mle-full-gmm-test.cc | 7 ++----- src/gmmbin/gmm-acc-mllt-global.cc | 4 +--- src/ivector/ivector-extractor-test.cc | 3 +-- src/kwsbin/kws-search.cc | 2 -- src/latbin/lattice-oracle.cc | 3 +-- src/latbin/lattice-prune.cc | 3 +-- src/latbin/lattice-to-mpe-post.cc | 3 +-- src/latbin/lattice-to-smbr-post.cc | 3 +-- src/matrix/matrix-functions.cc | 2 -- src/nnet2/nnet-compute-discriminative.cc | 3 +-- src/nnet3/nnet-example-utils.cc | 2 -- src/online2bin/apply-cmvn-online.cc | 3 +-- src/online2bin/ivector-extract-online2.cc | 3 +-- src/tree/build-tree-utils.cc | 2 -- 20 files changed, 21 insertions(+), 53 deletions(-) diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index a9904a752cd..572cbb4effd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -185,12 +185,12 @@ class MessageLogger { #define KALDI_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_ASSERT(cond) (void)0 +#define KALDI_ASSERT(cond) (void)(cond) #endif // Some more expensive asserts only checked if this defined. 
@@ -198,12 +198,12 @@ class MessageLogger { #define KALDI_PARANOID_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_PARANOID_ASSERT(cond) (void)0 +#define KALDI_PARANOID_ASSERT(cond) (void)(cond) #endif /***** THIRD-PARTY LOG-HANDLER *****/ diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 3c93dfd0d39..6aee0c5ce78 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -49,7 +49,7 @@ int32 TypeOneUsage(const ParseOptions &po, } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -78,7 +78,6 @@ int32 TypeOneUsage(const ParseOptions &po, << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " @@ -124,7 +123,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -151,7 +150,6 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 3e622cafdc7..d03bf671245 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -52,7 +52,7 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 n_utts = 0, n_total_vectors = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !vector_reader1.Done(); vector_reader1.Next()) { std::string key = vector_reader1.Key(); @@ -75,7 +75,6 @@ int32 TypeOneUsage(const ParseOptions &po) { << "system " << (i + 2) << ", rspecifier: " << vector_in_fns[i] << " vs " << vector_out.Dim() << " primary vector, rspecifier:" << vector_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No vector found for utterance " << key << " for " diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60a2645b31b 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -347,7 +347,7 @@ int main(int argc, char *argv[]) { // not configurable for now. 
exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_read = 0, num_written = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -361,7 +361,6 @@ int main(int argc, char *argv[]) { BaseFloat weight = 1.0; if (!egs_weight_reader.HasKey(key)) { KALDI_WARN << "No weight for example key " << key; - num_err++; continue; } weight = egs_weight_reader.Value(key); @@ -371,7 +370,6 @@ int main(int argc, char *argv[]) { if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; - num_err++; continue; } std::string new_output_name = output_name_reader.Value(key); diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index b67b0ba6fa6..ea6608ce38a 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -411,8 +411,6 @@ void PreDeterminize(MutableFst *fst, std::vector d_vec(max_state+1, false); // "done vector". Purely for debugging. - size_t num_extra_det_states = 0; - // (D)(v) while (Q.size() != 0) { @@ -491,7 +489,6 @@ void PreDeterminize(MutableFst *fst, assert(m_map.count(this_pr.first) == 0); m_map[this_pr.first] = k; k++; - num_extra_det_states++; } } else { // Create the set V_t. V_t.insert(this_pr.second); diff --git a/src/gmm/mle-diag-gmm-test.cc b/src/gmm/mle-diag-gmm-test.cc index d1af7725d20..a91832cd254 100644 --- a/src/gmm/mle-diag-gmm-test.cc +++ b/src/gmm/mle-diag-gmm-test.cc @@ -139,12 +139,10 @@ void test_flags_driven_update(const DiagGmm &gmm, // now both models gmm_all_update, gmm_all_update have the same params updated // compute loglike for models for check - double loglike0 = 0.0; double loglike1 = 0.0; double loglike2 = 0.0; for (int32 i = 0; i < feats.NumRows(); i++) { - loglike0 += static_cast( - gmm.LogLikelihood(feats.Row(i))); + gmm.LogLikelihood(feats.Row(i)); loglike1 += static_cast( gmm_all_update.LogLikelihood(feats.Row(i))); loglike2 += static_cast( @@ -366,9 +364,8 @@ UnitTestEstimateDiagGmm() { est_gmm.Resize(gmm->NumGauss(), gmm->Dim(), flags_all); est_gmm.SetZero(flags_all); - float loglike = 0.0; for (size_t i = 0; i < counter; i++) { - loglike += est_gmm.AccumulateFromDiag(*gmm, feats.Row(i), 1.0F); + est_gmm.AccumulateFromDiag(*gmm, feats.Row(i), 1.0F); } test_io(*gmm, est_gmm, false, feats); // ASCII mode test_io(*gmm, est_gmm, true, feats); // Binary mode diff --git a/src/gmm/mle-full-gmm-test.cc b/src/gmm/mle-full-gmm-test.cc index 472db88d501..26c5460f024 100644 --- a/src/gmm/mle-full-gmm-test.cc +++ b/src/gmm/mle-full-gmm-test.cc @@ -200,12 +200,10 @@ void test_flags_driven_update(const FullGmm &gmm, // now both models gmm_all_update, gmm_all_update have the same params updated // compute loglike for models for check - double loglike0 = 0.0; double loglike1 = 0.0; double loglike2 = 0.0; for (int32 i = 0; i < feats.NumRows(); i++) { - loglike0 += static_cast( - gmm.LogLikelihood(feats.Row(i))); + gmm.LogLikelihood(feats.Row(i)); loglike1 += static_cast( gmm_all_update.LogLikelihood(feats.Row(i))); loglike2 += static_cast( @@ -462,9 +460,8 @@ UnitTestEstimateFullGmm() { est_gmm.Resize(gmm->NumGauss(), gmm->Dim(), flags_all); est_gmm.SetZero(flags_all); - float loglike = 0.0; for (int32 i = 0; i < counter; i++) { - loglike += est_gmm.AccumulateFromFull(*gmm, feats.Row(i), 1.0F); + est_gmm.AccumulateFromFull(*gmm, feats.Row(i), 1.0F); } 
test_io(*gmm, est_gmm, false, feats); test_io(*gmm, est_gmm, true, feats); diff --git a/src/gmmbin/gmm-acc-mllt-global.cc b/src/gmmbin/gmm-acc-mllt-global.cc index bed91c053d3..b6b7a2b5635 100644 --- a/src/gmmbin/gmm-acc-mllt-global.cc +++ b/src/gmmbin/gmm-acc-mllt-global.cc @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) { SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - int32 num_done = 0, num_err = 0; + int32 num_done = 0; for (; !feature_reader.Done(); feature_reader.Next()) { std::string utt = feature_reader.Key(); const Matrix &mat = feature_reader.Value(); @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) { } else { if (!gselect_reader.HasKey(utt)) { KALDI_WARN << "No gselect information for utterance " << utt; - num_err++; continue; } const std::vector > &gselect= gselect_reader.Value(utt); @@ -96,7 +95,6 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Gselect information has wrong size for utterance " << utt << ", " << gselect.size() << " vs. " << mat.NumRows(); - num_err++; continue; } diff --git a/src/ivector/ivector-extractor-test.cc b/src/ivector/ivector-extractor-test.cc index cb08464fbe8..ffd5a2561cc 100644 --- a/src/ivector/ivector-extractor-test.cc +++ b/src/ivector/ivector-extractor-test.cc @@ -94,11 +94,10 @@ void TestIvectorExtraction(const IvectorExtractor &extractor, ivector_dim = extractor.IvectorDim(); Posterior post(num_frames); - double tot_log_like = 0.0; for (int32 t = 0; t < num_frames; t++) { SubVector frame(feats, t); Vector posterior(fgmm.NumGauss(), kUndefined); - tot_log_like += fgmm.ComponentPosteriors(frame, &posterior); + fgmm.ComponentPosteriors(frame, &posterior); for (int32 i = 0; i < posterior.Dim(); i++) post[t].push_back(std::make_pair(i, posterior(i))); } diff --git a/src/kwsbin/kws-search.cc b/src/kwsbin/kws-search.cc index 8e2b2a84def..c76a5d46eb9 100644 --- a/src/kwsbin/kws-search.cc +++ b/src/kwsbin/kws-search.cc @@ -287,7 +287,6 @@ int main(int argc, char *argv[]) { ArcSort(&index, fst::ILabelCompare()); int32 n_done = 0; - int32 n_fail = 0; for (; !keyword_reader.Done(); keyword_reader.Next()) { std::string key = keyword_reader.Key(); VectorFst keyword = keyword_reader.Value(); @@ -336,7 +335,6 @@ int main(int argc, char *argv[]) { if (result_fst.Final(arc.nextstate) != Weight::One()) { KALDI_WARN << "The resulting FST does not have " << "the expected structure for key " << key; - n_fail++; continue; } diff --git a/src/latbin/lattice-oracle.cc b/src/latbin/lattice-oracle.cc index 5f2513131d7..054a0676e37 100644 --- a/src/latbin/lattice-oracle.cc +++ b/src/latbin/lattice-oracle.cc @@ -257,7 +257,7 @@ int main(int argc, char *argv[]) { } int32 n_done = 0, n_fail = 0; - int32 tot_correct = 0, tot_substitutions = 0, + int32 tot_substitutions = 0, tot_insertions = 0, tot_deletions = 0, tot_words = 0; for (; !lattice_reader.Done(); lattice_reader.Next()) { @@ -320,7 +320,6 @@ int main(int argc, char *argv[]) { KALDI_LOG << "%WER " << (100.*tot_errs) / num_words << " [ " << tot_errs << " / " << num_words << ", " << insertions << " insertions, " << deletions << " deletions, " << substitutions << " sub ]"; - tot_correct += correct; tot_substitutions += substitutions; tot_insertions += insertions; tot_deletions += deletions; diff --git a/src/latbin/lattice-prune.cc b/src/latbin/lattice-prune.cc index 49399f748e4..d87f5ded28f 100644 --- a/src/latbin/lattice-prune.cc +++ b/src/latbin/lattice-prune.cc @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { 
SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - int32 n_done = 0, n_err = 0; + int32 n_done = 0; int64 n_arcs_in = 0, n_arcs_out = 0, n_states_in = 0, n_states_out = 0; @@ -86,7 +86,6 @@ int main(int argc, char *argv[]) { CompactLattice pruned_clat(clat); if (!PruneLattice(beam, &pruned_clat)) { KALDI_WARN << "Error pruning lattice for utterance " << key; - n_err++; } int64 pruned_narcs = NumArcs(pruned_clat), pruned_nstates = pruned_clat.NumStates(); diff --git a/src/latbin/lattice-to-mpe-post.cc b/src/latbin/lattice-to-mpe-post.cc index 7961cc5c438..771399a32a4 100644 --- a/src/latbin/lattice-to-mpe-post.cc +++ b/src/latbin/lattice-to-mpe-post.cc @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); } - int32 num_done = 0, num_err = 0; + int32 num_done = 0; double total_lat_frame_acc = 0.0, lat_frame_acc; double total_time = 0, lat_time; @@ -114,7 +114,6 @@ int main(int argc, char *argv[]) { if (!alignments_reader.HasKey(key)) { KALDI_WARN << "No alignment for utterance " << key; - num_err++; } else { const std::vector &alignment = alignments_reader.Value(key); Posterior post; diff --git a/src/latbin/lattice-to-smbr-post.cc b/src/latbin/lattice-to-smbr-post.cc index e2772316954..6b2861b395f 100644 --- a/src/latbin/lattice-to-smbr-post.cc +++ b/src/latbin/lattice-to-smbr-post.cc @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); } - int32 num_done = 0, num_err = 0; + int32 num_done = 0; double total_lat_frame_acc = 0.0, lat_frame_acc; double total_time = 0, lat_time; @@ -115,7 +115,6 @@ int main(int argc, char *argv[]) { if (!alignments_reader.HasKey(key)) { KALDI_WARN << "No alignment for utterance " << key; - num_err++; } else { const std::vector &alignment = alignments_reader.Value(key); Posterior post; diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 496c09f5344..6942b220da6 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -669,12 +669,10 @@ void ComputePca(const MatrixBase &X, Nsp.TopEigs(&l, &Vtmp); } - MatrixIndexT num_zeroed = 0; for (MatrixIndexT g = 0; g < G; g++) { if (l(g) < 0.0) { KALDI_WARN << "In PCA, setting element " << l(g) << " to zero."; l(g) = 0.0; - num_zeroed++; } } SortSvd(&l, &Vtmp); // Make sure zero elements are last, this diff --git a/src/nnet2/nnet-compute-discriminative.cc b/src/nnet2/nnet-compute-discriminative.cc index 65c48097bf9..16d34160508 100644 --- a/src/nnet2/nnet-compute-discriminative.cc +++ b/src/nnet2/nnet-compute-discriminative.cc @@ -296,7 +296,7 @@ void NnetDiscriminativeUpdater::LatticeComputations() { ScalePosterior(eg_.weight, &post); - double tot_num_post = 0.0, tot_den_post = 0.0; + double tot_num_post = 0.0; std::vector > sv_labels; sv_labels.reserve(answers.size()); for (int32 t = 0; t < post.size(); t++) { @@ -304,7 +304,6 @@ void NnetDiscriminativeUpdater::LatticeComputations() { int32 pdf_id = post[t][i].first; BaseFloat weight = post[t][i].second; if (weight > 0.0) { tot_num_post += weight; } - else { tot_den_post -= weight; } MatrixElement elem = {t, pdf_id, weight}; sv_labels.push_back(elem); } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index facbbb19be0..06278610553 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -673,11 +673,9 @@ void UtteranceSplitter::InitSplits(std::vector > *splits) con 
vec.push_back(config_.num_frames[i]); if (j > 0) vec.push_back(config_.num_frames[j]); - int32 n = 0; while (DefaultDurationOfSplit(vec) <= default_duration_ceiling) { if (!vec.empty()) // Don't allow the empty vector as a split. splits_set.insert(vec); - n++; vec.push_back(primary_length); std::sort(vec.begin(), vec.end()); } diff --git a/src/online2bin/apply-cmvn-online.cc b/src/online2bin/apply-cmvn-online.cc index 06157d0fcdf..615941f760a 100644 --- a/src/online2bin/apply-cmvn-online.cc +++ b/src/online2bin/apply-cmvn-online.cc @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { BaseFloatMatrixWriter feature_writer(feature_wspecifier); - int32 num_done = 0, num_err = 0; + int32 num_done = 0; int64 tot_t = 0; if (spk2utt_rspecifier != "") { @@ -82,7 +82,6 @@ int main(int argc, char *argv[]) { std::string utt = uttlist[i]; if (!feature_reader.HasKey(utt)) { KALDI_WARN << "No features for utterance " << utt; - num_err++; continue; } const Matrix &feats = feature_reader.Value(utt); diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc index e697de6d15a..eafc0e64124 100644 --- a/src/online2bin/ivector-extract-online2.cc +++ b/src/online2bin/ivector-extract-online2.cc @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) { feature_rspecifier = po.GetArg(2), ivectors_wspecifier = po.GetArg(3); - double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0, + double tot_objf_impr = 0.0, tot_t = 0.0, tot_length = 0.0, tot_length_utt_end = 0.0; int32 num_done = 0, num_err = 0; @@ -166,7 +166,6 @@ int main(int argc, char *argv[]) { } // Update diagnostics. - tot_ubm_loglike += T * ivector_feature.UbmLogLikePerFrame(); tot_objf_impr += T * ivector_feature.ObjfImprPerFrame(); tot_length_utt_end += T * ivectors.Row(num_ivectors - 1).Norm(2.0); for (int32 i = 0; i < num_ivectors; i++) diff --git a/src/tree/build-tree-utils.cc b/src/tree/build-tree-utils.cc index 254d7ec36d8..cf88a408fcb 100644 --- a/src/tree/build-tree-utils.cc +++ b/src/tree/build-tree-utils.cc @@ -538,7 +538,6 @@ EventMap *SplitDecisionTree(const EventMap &input_map, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out) { KALDI_ASSERT(num_leaves != NULL && *num_leaves > 0); // can't be 0 or input_map would be empty. - int32 num_empty_leaves = 0; BaseFloat like_impr = 0.0; BaseFloat smallest_split_change = 1.0e+20; std::vector builders; @@ -550,7 +549,6 @@ EventMap *SplitDecisionTree(const EventMap &input_map, builders.resize(split_stats.size()); // size == #leaves. for (size_t i = 0;i < split_stats.size();i++) { EventAnswerType leaf = static_cast(i); - if (split_stats[i].size() == 0) num_empty_leaves++; builders[i] = new DecisionTreeSplitter(leaf, split_stats[i], q_opts); } } From f785fa0b623c14f28bf01dc1bd12534a24ed78bf Mon Sep 17 00:00:00 2001 From: Roland Fehrenbacher Date: Mon, 29 Jan 2024 15:01:33 +0100 Subject: [PATCH 53/76] configure: Fix gcc version check for cuda --- src/configure | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/configure b/src/configure index 37a75a5cade..82e4e3d9149 100755 --- a/src/configure +++ b/src/configure @@ -389,8 +389,8 @@ Either your CUDA is too new or too old." CUSOLVER=true ;; 12_*) - MIN_UNSUPPORTED_GCC_VER="12.2" - MIN_UNSUPPORTED_GCC_VER_NUM=122000 + MIN_UNSUPPORTED_GCC_VER="12.3" + MIN_UNSUPPORTED_GCC_VER_NUM=123000 CUSOLVER=true ;; *) @@ -399,9 +399,10 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ output of either 'nvcc -h' or 'ptxas -h'." 
;;
 esac

-  (( GCC_VER_NUM < MIN_UNSUPPORTED_GCC_VER_NUM )) ||
+  if [ $GCC_VER_NUM -ge $MIN_UNSUPPORTED_GCC_VER_NUM ]; then
   failure "CUDA $CUDA_VERSION does not support $CXX (g++-$GCC_VER).\
 Only versions strictly older than $MIN_UNSUPPORTED_GCC_VER are supported."
+  fi

 case $CUDA_VERSION in
   [1-8]_* | 9_0) CUSOLVER=false ;;

From a74256f1e1c9a9db4661b6a9ffd1900c8eed97d4 Mon Sep 17 00:00:00 2001
From: Paul Guyot
Date: Tue, 30 Apr 2024 09:14:10 +0200
Subject: [PATCH 54/76] Workaround for macOS bug with strdup (fixes #4855)

Signed-off-by: Paul Guyot
---
 tools/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/Makefile b/tools/Makefile
index 5099c60505a..951280d08f5 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -165,6 +165,11 @@ sph2pipe_v$(SPH2PIPE_VERSION)/Makefile: sph2pipe-$(SPH2PIPE_VERSION).tar.gz
 	rm -rf sph2pipe_v*
 	tar -xmzf sph2pipe-$(SPH2PIPE_VERSION).tar.gz
 	mv sph2pipe-$(SPH2PIPE_VERSION) sph2pipe_v$(SPH2PIPE_VERSION)
+	# Workaround for macOS bug
+	if [ `uname` = "Darwin" ]; then \
+	  sed -i -e "s/#define _XOPEN_SOURCE 500/#define _XOPEN_SOURCE 600/g" sph2pipe_v$(SPH2PIPE_VERSION)/sph2pipe.c ; \
+	  sed -i -e "s/#define _XOPEN_SOURCE 500/#define _XOPEN_SOURCE 600/g" sph2pipe_v$(SPH2PIPE_VERSION)/file_headers.c ; \
+	fi

 sph2pipe-$(SPH2PIPE_VERSION).tar.gz:
 	if [ -d "$(DOWNLOAD_DIR)" ]; then \

From a979f2a565b4771b5d197b19c3a1510c049b6f41 Mon Sep 17 00:00:00 2001
From: Paul Guyot
Date: Tue, 30 Apr 2024 09:45:27 +0200
Subject: [PATCH 55/76] Patch sctk for macOS picky compiler

Signed-off-by: Paul Guyot
---
 tools/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/Makefile b/tools/Makefile
index 5099c60505a..dbe932aa0e6 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -121,6 +121,8 @@ sclite sctk_made: sctk/.compiled

 sctk/.compiled: sctk
 	rm -f sctk/.compiled
+	sed -i -e '2s/^/#include \n/' sctk/src/sclite/align.c
+	sed -i -e '99s/^/int TEXT_set_lang_prof(char *lprof);\n/' sctk/src/sclite/text.h
 	$(SCTK_MKENV) $(MAKE) -C sctk config
 	$(SCTK_MKENV) $(MAKE) -C sctk all doc
 	$(MAKE) -C sctk install

From f328393374b4d9c99fafe09fc793e251096ff355 Mon Sep 17 00:00:00 2001
From: danijel3
Date: Sun, 2 Jun 2024 23:11:33 +0200
Subject: [PATCH 56/76] Fix missing FLT_MAX in some CUDA installation scenarios.
---
 src/cudadecoder/cuda-decoder-kernels.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu
index 8503182c1f8..e20a7dea15c 100644
--- a/src/cudadecoder/cuda-decoder-kernels.cu
+++ b/src/cudadecoder/cuda-decoder-kernels.cu
@@ -26,6 +26,10 @@
 #include "cuda-decoder-kernels.h"
 #include "cuda-decoder-kernels-utils.h"

+#ifndef FLT_MAX
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+#endif
+
 namespace kaldi {
 namespace cuda_decoder {

From c4515b07669a9d9e372fdb906c244961e91a32a9 Mon Sep 17 00:00:00 2001
From: "Jan \"yenda\" Trmal"
Date: Mon, 22 Jul 2024 13:42:31 +0200
Subject: [PATCH 57/76] Fix reported issues w.r.t python2.7 and some apple silicon quirks
---
 tools/extras/check_dependencies.sh | 77 ++++++++++++++++++------------
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index 155a376b6e6..12504104b9a 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -82,61 +82,73 @@ if ! have libtoolize && ! have glibtoolize; then
   add_packages libtool
 fi

-if ! have svn; then
-  echo "$0: subversion is not installed"
-  add_packages subversion
-fi
-
 if !
have awk; then echo "$0: awk is not installed" add_packages gawk fi -pythonok=true +pythonok=false +python3=false +python27=false + + if ! have python2.7; then echo "$0: python2.7 is not installed" - add_packages python27 python2.7 - pythonok=false +else + echo "$0: python2.7 present" + python27=true + pythonok=true fi if ! have python3; then echo "$0: python3 is not installed" add_packages python3 - pythonok=false +else + echo "$0: python3 present" + python3=true + pythonok=true fi ( #Use a subshell so that sourcing env.sh does not have an influence on the rest of the script [ -f ./env.sh ] && . ./env.sh -if $pythonok && ! have python2; then - mkdir -p $PWD/python - echo "$0: python2.7 is installed, but the python2 binary does not exist." \ - "Creating a symlink and adding this to tools/env.sh" - ln -s $(command -v python2.7) $PWD/python/python2 - echo "export PATH=$PWD/python:\${PATH}" >> env.sh -fi - -if [[ -f $PWD/python/.use_default_python && -f $PWD/python/python ]]; then - rm $PWD/python/python -fi - -if $pythonok && have python && [[ ! -f $PWD/python/.use_default_python ]]; then - version=$(python 2>&1 --version | awk '{print $2}') - if [[ $version != "2.7"* ]] ; then - echo "$0: WARNING python 2.7 is not the default python. We fixed this by" \ - "adding a correct symlink more prominently on the path." - echo " ... If you really want to use python $version as default, add an" \ +rm -f $PWD/python/python* +if ! [ -f $PWD/python/.use_default_python ]; then + echo "$0: Configuring python" + echo "$0: ... If you really want to avoid this, add an" \ "empty file $PWD/python/.use_default_python and run this script again." - mkdir -p $PWD/python + if $python27 ; then + echo "$0: ... python2.7 found, making it default (python, python2, python2.7)" ln -s $(command -v python2.7) $PWD/python/python - echo "export PATH=$PWD/python:\${PATH}" >> env.sh + ln -s $(command -v python2.7) $PWD/python/python2 + ln -s $(command -v python2.7) $PWD/python/python2.7 + fi + + if $python3 ; then + echo "$0: ... python3 found, making symlink (python3)" + ln -s $(command -v python3) $PWD/python/python3 + if ! $python27 ; then + echo "$0: ... ... python2.7 not found, using python3 as python" + ln -s $(command -v python3) $PWD/python/python + fi + fi +else + echo "$0: Not configuring python(s) -- using system defaults" + if ! have python ; then + echo "$0: WARNING: 'python' executable not present, configuring" + if $python27 ; then + ln -s $(command -v python2.7) $PWD/python/python + elif $python3 ; then + ln -s $(command -v python3) $PWD/python/python + fi fi fi + ) mathlib_missing=false -case $(uname -m) in - x86_64) # Suggest MKL on an Intel64 system (not supported on i?86 hosts). +case "$(uname -m)-$(uname -s)" in + x86_64*) # Suggest MKL on an Intel64 system (not supported on i?86 hosts). # Respect user-supplied MKL_ROOT environment variable. MKL_ROOT="${MKL_ROOT:-/opt/intel/mkl}" # Check the well-known mkl.h file location. @@ -155,6 +167,9 @@ case $(uname -m) in mathlib_missing=true fi ;; + arm64-Darwin) ## Apple Silicon + echo "$0: Relying on Acceleration framework" + ;; *) # Suggest OpenBLAS on other hardware. if [ ! -f $(pwd)/OpenBLAS/install/include/openblas_config.h ] && ! 
echo '#include ' | From ed29e165ecb50698abd241129fe1f909953f6375 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Mon, 22 Jul 2024 14:54:49 +0200 Subject: [PATCH 58/76] catch exception by reference so that compiler does not complain --- src/util/parse-options-test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/util/parse-options-test.cc b/src/util/parse-options-test.cc index af1fcc00880..ec7491ad9cb 100644 --- a/src/util/parse-options-test.cc +++ b/src/util/parse-options-test.cc @@ -120,7 +120,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My boolean"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -144,7 +144,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My string"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -186,7 +186,7 @@ void UnitTestParseOptions() { po4.Read(argc4, argv4); KALDI_ASSERT(val == "bar"); } - + try { // test error with --float=string int argc4 = 2; const char *argv4[2] = { "program_name", "--option=foo"}; @@ -195,7 +195,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My float"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -208,7 +208,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -220,7 +220,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -232,7 +232,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)xxx."; } @@ -244,7 +244,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My bool"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. 
- } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -258,7 +258,7 @@ void UnitTestParseOptions() { po4.Register("num", &num, "My int32 variable"); po4.Read(argc4, argv4); KALDI_ASSERT(num == 0); - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } From 4145e17c12da9e4468a07b3895c84907c131ad6e Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 11:32:37 +0200 Subject: [PATCH 59/76] fix tests and address comments --- src/util/kaldi-table-test.cc | 40 ++++++++++++++++++++++------------ src/util/parse-options-test.cc | 16 +++++++------- src/util/stl-utils-test.cc | 6 ++--- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/util/kaldi-table-test.cc b/src/util/kaldi-table-test.cc index 358e33e686a..3613e44fc76 100644 --- a/src/util/kaldi-table-test.cc +++ b/src/util/kaldi-table-test.cc @@ -351,7 +351,8 @@ void UnitTestTableSequentialInt32(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -384,7 +385,8 @@ void UnitTestTableSequentialBool(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -418,7 +420,8 @@ void UnitTestTableSequentialDouble(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { KALDI_ASSERT(v2 == v); @@ -462,7 +465,8 @@ void UnitTestTableSequentialDoubleBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { KALDI_ASSERT(v2 == v); @@ -511,7 +515,8 @@ void UnitTestTableSequentialInt32VectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -551,7 +556,8 @@ void UnitTestTableSequentialInt32PairVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -594,7 +600,8 @@ void UnitTestTableSequentialInt32VectorVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -641,7 +648,8 @@ void UnitTestTableSequentialInt32Script(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); unlink("tmp.scp"); for (size_t i = 0; i < script.size(); i++) { @@ -684,7 +692,8 @@ void UnitTestTableSequentialDoubleMatrixBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(new Matrix(sbr.Value())); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { for (size_t i = 0; i < v2.size(); i++) @@ -738,7 +747,8 @@ void UnitTestTableSequentialBaseFloatVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(new Vector(sbr.Value())); } - 
KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { for (size_t i = 0; i < v2.size(); i++) @@ -831,10 +841,11 @@ void UnitTestTableRandomBothDouble(bool binary, bool read_scp, bool ans = sbr.HasKey(cur_key); KALDI_ASSERT(ans == true); } + auto v2 = sbr.Value(cur_key); if (binary) { - KALDI_ASSERT(value == sbr.Value(cur_key)); + KALDI_ASSERT(value == v2); } else { - KALDI_ASSERT(ApproxEqual(value, sbr.Value(cur_key))); + KALDI_ASSERT(ApproxEqual(value, v2)); } } } @@ -1039,10 +1050,11 @@ void UnitTestTableRandomBothDoubleMatrix(bool binary, bool read_scp, bool ans = sbr.HasKey(cur_key); KALDI_ASSERT(ans == true); } + auto v2 = sbr.Value(cur_key); if (binary) { - KALDI_ASSERT(value_ptr->ApproxEqual(sbr.Value(cur_key), 1.0e-10)); + KALDI_ASSERT(value_ptr->ApproxEqual(v2, 1.0e-10)); } else { - KALDI_ASSERT(value_ptr->ApproxEqual(sbr.Value(cur_key), 0.01)); + KALDI_ASSERT(value_ptr->ApproxEqual(v2, 0.01)); } } } diff --git a/src/util/parse-options-test.cc b/src/util/parse-options-test.cc index ec7491ad9cb..b242130b8c7 100644 --- a/src/util/parse-options-test.cc +++ b/src/util/parse-options-test.cc @@ -120,7 +120,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My boolean"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -144,7 +144,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My string"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -195,7 +195,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My float"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -208,7 +208,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -220,7 +220,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -232,7 +232,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)xxx."; } @@ -244,7 +244,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My bool"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. 
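The Close() rewrites in the kaldi-table-test.cc hunks above all follow one pattern, and the reason deserves a note: an expression with side effects must not live inside an assertion macro that can be compiled out. A sketch, assuming KALDI_ASSERT can be disabled like the standard assert():

#include <cassert>
#include <cstdio>

// Stand-in for SequentialTableReader::Close(): it has a side effect
// (releasing the stream) and returns a success flag.
static bool CloseStream() { std::puts("stream closed"); return true; }

int main() {
  // Risky: if assertions compile to nothing (e.g. -DNDEBUG), the call
  // disappears with them and the stream is never closed.
  assert(CloseStream());

  // Safe: the side effect always runs; only the check can vanish.
  bool ans = CloseStream();
  assert(ans);
  (void)ans;
  return 0;
}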
- } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -258,7 +258,7 @@ void UnitTestParseOptions() { po4.Register("num", &num, "My int32 variable"); po4.Read(argc4, argv4); KALDI_ASSERT(num == 0); - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } diff --git a/src/util/stl-utils-test.cc b/src/util/stl-utils-test.cc index 11781e2f938..3a54fc82bac 100644 --- a/src/util/stl-utils-test.cc +++ b/src/util/stl-utils-test.cc @@ -148,9 +148,9 @@ void TestCopyMapValuesToVector() { CopyMapValuesToVector(mp, &v); KALDI_ASSERT(mp.size() == v.size()); int i = 0; - for (std::map::iterator iter = mp.begin(); iter != mp.end(); - iter++) { - KALDI_ASSERT(v[i++] == iter->second); + for (auto iter = mp.begin(); iter != mp.end(); + iter++, i++) { + KALDI_ASSERT(v[i] == iter->second); } } } From 38ea2b1c924d2a5a623aaac700b9ee3f3ab0e953 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 13:55:12 +0200 Subject: [PATCH 60/76] disable warning about unused flags msse and msse on Apple Silicon --- src/configure | 2 ++ src/makefiles/darwin_arm64.mk | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/makefiles/darwin_arm64.mk diff --git a/src/configure b/src/configure index 82e4e3d9149..f55e320ff97 100755 --- a/src/configure +++ b/src/configure @@ -1150,6 +1150,8 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" + elif [ "`uname -m`" == "arm64" ]; then + cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk fi diff --git a/src/makefiles/darwin_arm64.mk b/src/makefiles/darwin_arm64.mk new file mode 100644 index 00000000000..149a3d97118 --- /dev/null +++ b/src/makefiles/darwin_arm64.mk @@ -0,0 +1,36 @@ +# Darwin (macOS) configuration + +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif + +CXXFLAGS = -std=c++14 -I.. -I$(OPENFSTINC) -O1 $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK \ + -pthread \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +else ifeq ($(findstring GCC,$(COMPILER)),GCC) +# Allow implicit conversions between vectors. 
+CXXFLAGS += -flax-vector-conversions +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl From 3cf3c1a72caec64d01e00546dfdac9a33c805641 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 14:53:59 +0200 Subject: [PATCH 61/76] do a full cleanup on apple silicon --- src/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index bc4375e30f6..5036d12b707 100644 --- a/src/Makefile +++ b/src/Makefile @@ -62,14 +62,16 @@ endif # Don't call rm -rf. rmlibdir: +ifeq ($(KALDI_FLAVOR), dynamic) ifneq ($(KALDILIBDIR), ) - -rm -f $(KALDILIBDIR)/*{.so,.a,.o} + -rm -f $(KALDILIBDIR)/*{.so,.a,.o,.dylib} -rmdir 2>/dev/null $(KALDILIBDIR); true else # KALDILIBDIR might have been unset because of reconfigure. Do a best guess. @echo "Something seems wrong. Please re-run configure." @echo "I will continue but the cleanup might not be complete." endif +endif kaldi.mk: @echo "ERROR: kaldi.mk does not exist; run ./configure first."; From 770daa212c5cddb1a559e2ac0906eaf3511b594d Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 16:07:03 +0200 Subject: [PATCH 62/76] improve compatibility with C++ standard, esp. C++20 --- src/cudamatrix/cu-array.h | 12 ++++++------ src/cudamatrix/cu-matrix.h | 22 +++++++++++----------- src/cudamatrix/cu-tp-matrix.h | 16 ++++++++-------- src/cudamatrix/cu-vector.h | 2 +- src/matrix/kaldi-matrix.h | 4 ++-- src/matrix/qr.cc | 26 +++++++++++++------------- 6 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 84f78f00a91..aaaddad75c8 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -105,7 +105,7 @@ class CuArrayBase { protected: /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuArrayBase(): data_(NULL), dim_(0) { } + CuArrayBase(): data_(NULL), dim_(0) { } T *data_; ///< GPU data pointer (if GPU not available, @@ -126,19 +126,19 @@ class CuArray: public CuArrayBase { /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. - CuArray() { } + CuArray() { } /// Constructor with memory initialisation. resize_type may be kSetZero or /// kUndefined. - explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) + explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) { Resize(dim, resize_type); } /// Constructor from CPU-based int vector - explicit CuArray(const std::vector &src) { CopyFromVec(src); } + explicit CuArray(const std::vector &src) { CopyFromVec(src); } /// Copy constructor. We don't make this explicit because we want to be able /// to create a std::vector. - CuArray(const CuArray &src) { CopyFromArray(src); } + CuArray(const CuArray &src) { CopyFromArray(src); } /// Destructor ~CuArray() { Destroy(); } @@ -182,7 +182,7 @@ class CuSubArray: public CuArrayBase { /// Constructor as a range of an existing CuArray or CuSubArray. Note: like /// similar constructors in class CuVector and others, it can be used to evade /// 'const' constraints; don't do that. 
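The cu-array.h edits above are whitespace-only, but the idiom they brush past is worth a sketch: the base class keeps its constructor protected so that only the owning derived types can be instantiated (simplified, hypothetical names, not the real CuArray API):

#include <cstddef>

template <typename T>
class ArrayBase {
 protected:
  ArrayBase() : data_(nullptr), dim_(0) {}  // only derived classes call this
  T *data_;
  std::size_t dim_;
};

template <typename T>
class Array : public ArrayBase<T> {
 public:
  Array() {}  // users create Array<T>; a bare ArrayBase<T> will not compile
};

int main() {
  Array<int> a;
  // ArrayBase<int> b;  // error: protected constructor
  (void)a;
  return 0;
}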
- explicit CuSubArray(const CuArrayBase &src, + explicit CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim); /// Construct from raw pointers diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 3ffe67d8b06..775fecd82c6 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -250,7 +250,7 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + // This function will copy from source rows (start_range, end_range] // if the range is outside of the clamped region then the clamped // row will be replicated across the out of range areas @@ -307,9 +307,9 @@ class CuMatrixBase { void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); void Floor(const CuMatrixBase &src, Real floor_val); - + void Ceiling(const CuMatrixBase &src, Real ceiling_val); - + /// This is equivalent to running: /// Floor(src, lower_limit); /// Ceiling(src, upper_limit); @@ -320,7 +320,7 @@ class CuMatrixBase { /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. void ExpSpecial(const CuMatrixBase &src); - + /// Softmax nonlinearity /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, /// with attention to avoiding overflow or underflow. @@ -333,7 +333,7 @@ class CuMatrixBase { /// Supports in-place operation (i.e. this == &src). void LogSoftMaxPerRow(const CuMatrixBase &src); - + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -439,23 +439,23 @@ class CuMatrixBase { this -> Pow(*this, power); }; - + inline void ApplyPowAbs(Real power, bool include_sign=false) { this -> PowAbs(*this, power, include_sign); }; - + inline void ApplyHeaviside() { this -> Heaviside(*this); }; - + inline void ApplyFloor(Real floor_val) { this -> Floor(*this, floor_val); }; - + inline void ApplyCeiling(Real ceiling_val) { this -> Ceiling(*this, ceiling_val); }; - + inline void ApplyExp() { this -> Exp(*this); }; @@ -924,7 +924,7 @@ class CuSubMatrix: public CuMatrixBase { /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. - inline CuSubMatrix (const CuSubMatrix &other): + inline CuSubMatrix(const CuSubMatrix &other): CuMatrixBase (other.data_, other.num_rows_, other.num_cols_, other.stride_) {} private: diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 8de46ec46f5..4219467f615 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -48,18 +48,18 @@ class CuTpMatrix : public CuPackedMatrix { CuTpMatrix() : CuPackedMatrix() {} explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) : CuPackedMatrix(r, resize_type) {} - - explicit CuTpMatrix(const TpMatrix &orig) + + explicit CuTpMatrix(const TpMatrix &orig) : CuPackedMatrix(orig) {} // This constructor lacks the "explicit" keyword so that // we can include this class in std::vector. 
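The comment above, about omitting the explicit keyword so the class can be used in a std::vector, reflects a general rule: several standard copy contexts use copy-initialization, which an explicit copy constructor forbids. A minimal demonstration with a hypothetical type:

#include <vector>

struct M {
  M() {}
  M(const M &) {}  // deliberately not explicit
};

int main() {
  M a;
  M b = a;                    // copy-initialization: needs a non-explicit copy ctor
  std::vector<M> v = {a, b};  // initializing a vector from elements does too
  (void)v;
  return 0;
}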
- CuTpMatrix(const CuTpMatrix &orig) + CuTpMatrix(const CuTpMatrix &orig) : CuPackedMatrix(orig) {} - - explicit CuTpMatrix(const CuMatrixBase &orig, + + explicit CuTpMatrix(const CuMatrixBase &orig, MatrixTransposeType trans = kNoTrans); - + ~CuTpMatrix() {} void CopyFromMat(const CuMatrixBase &M, @@ -70,12 +70,12 @@ class CuTpMatrix : public CuPackedMatrix { } void CopyFromTp(const TpMatrix &other) { CuPackedMatrix::CopyFromPacked(other); - } + } void Cholesky(const CuSpMatrix& Orig); void Invert(); CuTpMatrix &operator = (const CuTpMatrix &in); - + protected: inline const TpMatrix &Mat() const { return *(reinterpret_cast* >(this)); diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index f1c32756887..82e1fb47fcb 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -243,7 +243,7 @@ class CuVectorBase { /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuVectorBase(): data_(NULL), dim_(0) { } + CuVectorBase(): data_(NULL), dim_(0) { } Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 064edf4237b..b44bf1d934f 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -1006,11 +1006,11 @@ class SubMatrix : public MatrixBase { MatrixIndexT num_cols, MatrixIndexT stride); - ~SubMatrix() {} + ~SubMatrix() {} /// This type of constructor is needed for Range() to work [in Matrix base /// class]. Cannot make it explicit. - SubMatrix (const SubMatrix &other): + SubMatrix(const SubMatrix &other): MatrixBase (other.data_, other.num_cols_, other.num_rows_, other.stride_) {} diff --git a/src/matrix/qr.cc b/src/matrix/qr.cc index 861dead05ba..db1b7359de9 100644 --- a/src/matrix/qr.cc +++ b/src/matrix/qr.cc @@ -57,7 +57,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { if (max_x == 0.0) max_x = 1.0; s = 1.0 / max_x; } - + Real sigma = 0.0; v[0] = 1.0; for (MatrixIndexT i = 1; i < dim; i++) { @@ -73,7 +73,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { v[0] = x1 - mu; } else { v[0] = -sigma / (x1 + mu); - KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); + KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); } Real v1 = v[0]; Real v1sq = v1 * v1; @@ -155,11 +155,11 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { with packed lower-triangular matrices to do it this way. There's also a shift from one-based to zero-based indexing, so the index k is transformed k -> n - k, and a corresponding transpose... - + Let the original *this be A. This algorithms replaces *this with a tridiagonal matrix T such that T = Q A Q^T for an orthogonal Q. Caution: Q is transposed vs. Golub and Van Loan. - If Q != NULL it outputs Q. + If Q != NULL it outputs Q. */ template void SpMatrix::Tridiagonalize(MatrixBase *Q) { @@ -195,7 +195,7 @@ void SpMatrix::Tridiagonalize(MatrixBase *Q) { if (Q != NULL) { // C.f. Golub, Q is H_1 .. H_n-2... in this // case we apply them in the opposite order so it's H_n-1 .. H_1, // but also Q is transposed so we really have Q = H_1 .. H_n-1. - // It's a double negative. + // It's a double negative. // Anyway, we left-multiply Q by each one. The H_n would each be // diag(I + beta v v', I) but we don't ever touch the last dims. // We do (in Matlab notation): @@ -309,7 +309,7 @@ void QrStep(MatrixIndexT n, if (k < n-2) { // Next is the elements (k+2, k) and (k+2, k-1), to be rotated, again // backwards. 
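For orientation in the qr.cc hunks: House() constructs v and beta so that the reflection (I - beta v v^T) maps x onto ||x|| e_1, which is what drives the tridiagonalization. A two-dimensional numeric check of that identity (standalone arithmetic, not Kaldi code):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double x[2] = {3.0, 4.0};
  double mu = std::sqrt(x[0] * x[0] + x[1] * x[1]);  // ||x|| = 5
  double v[2] = {x[0] - mu, x[1]};                   // v = x - ||x|| * e_1
  double beta = 2.0 / (v[0] * v[0] + v[1] * v[1]);
  double vtx = v[0] * x[0] + v[1] * x[1];            // v^T x
  double y0 = x[0] - beta * v[0] * vtx;              // first coord of the image
  double y1 = x[1] - beta * v[1] * vtx;              // second coord, should be 0
  std::printf("y = (%g, %g)\n", y0, y1);             // expect (5, 0)
  assert(std::fabs(y0 - 5.0) < 1e-12 && std::fabs(y1) < 1e-12);
  return 0;
}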
- Real &elem_kp2_k = z, + Real &elem_kp2_k = z, &elem_kp2_kp1 = off_diag[k+1]; // Note: elem_kp2_k == z would start off as zero because it's // two off the diagonal, and not been touched yet. Therefore @@ -338,7 +338,7 @@ void QrInternal(MatrixIndexT n, MatrixIndexT counter = 0, max_iters = 500 + 4*n, // Should never take this many iters. large_iters = 100 + 2*n; Real epsilon = (pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0)); - + for (; counter < max_iters; counter++) { // this takes the place of "until // q=n"... we'll break out of the // loop when we converge. @@ -356,7 +356,7 @@ void QrInternal(MatrixIndexT n, off_diag[i] = 0.0; } // The next code works out p, q, and npq which is n - p - q. - // For the definitions of q and p, see Golub and Van Loan; we + // For the definitions of q and p, see Golub and Van Loan; we // partition the n dims into pieces of size (p, n-p-q, q) where // the part of size q is diagonal and the part of size n-p-p is // "unreduced", i.e. has no zero off-diagonal elements. @@ -392,7 +392,7 @@ void QrInternal(MatrixIndexT n, } else { QrStep(npq, diag + p, off_diag + p, static_cast*>(NULL)); - } + } } if (counter == max_iters) { KALDI_WARN << "Failure to converge in QR algorithm. " @@ -490,7 +490,7 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, r.AddSpVec(1.0, S, Q.Row(d), 0.0); // r = S * q_d MatrixIndexT counter = 0; - Real end_prod; + Real end_prod = 0; while (1) { // Normally we'll do this loop only once: // we repeat to handle cases where r gets very much smaller // and we want to orthogonalize again. @@ -528,11 +528,11 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, } } - Matrix R(lanczos_dim, lanczos_dim); + Matrix R(lanczos_dim, lanczos_dim); R.SetUnit(); T.Qr(&R); // Diagonalizes T. Vector s_tmp(lanczos_dim); - s_tmp.CopyDiagFromSp(T); + s_tmp.CopyDiagFromSp(T); // Now T = R * diag(s_tmp) * R^T. // The next call sorts the elements of s from greatest to least absolute value, @@ -544,7 +544,7 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, SubMatrix Rsub(R, 0, eig_dim, 0, lanczos_dim); SubVector s_sub(s_tmp, 0, eig_dim); s->CopyFromVec(s_sub); - + // For working out what to do now, just assume the other eigenvalues were // zero. This is just for purposes of knowing how to get the result, and // not getting things wrongly transposed. From 816d438453fa65e90139676bb932ad06229421f5 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 16:15:12 +0200 Subject: [PATCH 63/76] make codefactor happier --- src/configure | 2 +- src/cudamatrix/cu-array.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/configure b/src/configure index f55e320ff97..c1e44512af9 100755 --- a/src/configure +++ b/src/configure @@ -1150,7 +1150,7 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" - elif [ "`uname -m`" == "arm64" ]; then + elif [ "$(uname -m)" == "arm64" ]; then cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index aaaddad75c8..3db44bf4aa5 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -111,7 +111,6 @@ class CuArrayBase { T *data_; ///< GPU data pointer (if GPU not available, ///< will point to CPU memory). 
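One qr.cc change above is behavioral rather than cosmetic: end_prod in TopEigs() is now zero-initialized before the retry loop, so the first comparison against it is well defined instead of possibly reading an indeterminate value. The hazard in miniature:

#include <cstdio>

int main() {
  double end_prod = 0;  // the patched form; without "= 0" the comparison
                        // below could read uninitialized memory (UB)
  while (true) {
    double new_prod = 0.25;           // stand-in for the recomputed product
    if (new_prod == end_prod) break;  // evaluated before any in-loop write
    end_prod = new_prod;
  }
  std::printf("%g\n", end_prod);
  return 0;
}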
MatrixIndexT dim_; ///< dimension of the vector - }; /** @@ -123,7 +122,6 @@ class CuArrayBase { template class CuArray: public CuArrayBase { public: - /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. CuArray() { } @@ -172,7 +170,6 @@ class CuArray: public CuArrayBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; - }; From 3f170d4d78584dfaf584d21ecddb020a73055853 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:23:57 +0300 Subject: [PATCH 64/76] Support openfst-1.7.6 --- src/chain/chain-supervision.cc | 10 ++++------ src/fstext/fstext-utils-inl.h | 12 ++++++------ src/fstext/kaldi-fst-io-inl.h | 2 +- src/fstext/pre-determinize-inl.h | 4 ++-- src/kws/kws-functions.cc | 2 +- src/lat/kaldi-lattice.cc | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f8a2c1d11cc..b29000a448c 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -571,9 +571,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst::WriteFst( - fst, fst::AcceptorCompactor(), os, - write_options); + fst::StdCompactAcceptorFst cfst(fst); + cfst.Write(os, write_options); } } else { KALDI_ASSERT(e2e_fsts.size() == num_sequences); @@ -586,9 +585,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst::WriteFst( - e2e_fsts[i], fst::AcceptorCompactor(), os, - write_options); + fst::StdCompactAcceptorFst cfst(e2e_fsts[i]); + cfst.Write(os, write_options); } } WriteToken(os, binary, ""); diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 853697387b9..d877c03e1ae 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -374,12 +374,12 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); - for (SymbolTableIterator iter(symtab); - !iter.Done(); - iter.Next()) { - if (include_eps || iter.Value() != 0) { - syms_out->push_back(iter.Value()); - KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. + for (SymbolTable::iterator iter = symtab.begin(); + iter != symtab.end(); + ++iter) { + if (include_eps || iter->Label() != 0) { + syms_out->push_back(iter->Label()); + KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. } } } diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index b6bae4b9dc9..f7bb3a7c2b5 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -44,7 +44,7 @@ void WriteFstKaldi(std::ostream &os, bool binary, bool acceptor = false, write_one = false; FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; // Write another newline as a terminating character. 
The read routine will diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index ea6608ce38a..7c1b544da4c 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -235,8 +235,8 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix, assert(symTable != NULL); const char *prefix_ptr = prefix.c_str(); size_t prefix_len = strlen(prefix_ptr); // allowed to be zero but not encouraged. - for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) { - const std::string &sym = siter.Symbol(); + for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) { + const std::string &sym = siter->Symbol(); if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) { // has prefix. if (isdigit(sym[prefix_len])) { // we don't allow prefix followed by a digit, as a symbol. // Has at least one digit. diff --git a/src/kws/kws-functions.cc b/src/kws/kws-functions.cc index d1d71ce7a42..3e27226f13c 100644 --- a/src/kws/kws-functions.cc +++ b/src/kws/kws-functions.cc @@ -75,7 +75,7 @@ bool ClusterLattice(CompactLattice *clat, unordered_map >::iterator iter; for (iter = head.begin(); iter != head.end(); ++iter) { // For this ilabel, sort all the arcs on time, from first to last. - sort(iter->second.begin(), iter->second.end(), CompareInterval); + std::sort(iter->second.begin(), iter->second.end(), CompareInterval); std::vector tmp; tmp.push_back(iter->second[0]); for (int32 i = 1; i < iter->second.size(); i++) { diff --git a/src/lat/kaldi-lattice.cc b/src/lat/kaldi-lattice.cc index 744cc538462..648e67115b7 100644 --- a/src/lat/kaldi-lattice.cc +++ b/src/lat/kaldi-lattice.cc @@ -78,7 +78,7 @@ bool WriteCompactLattice(std::ostream &os, bool binary, fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. The read routine will @@ -403,7 +403,7 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) { fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. 
The read routine will From e460d8a921209190568f6e5d11226d00377034d8 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:12 +0300 Subject: [PATCH 65/76] Support openfst-1.8.0 --- src/fstext/fstext-utils-inl.h | 2 +- src/fstext/fstext-utils.h | 2 +- src/fstext/lattice-utils-inl.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index d877c03e1ae..44e641a3f20 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -163,7 +163,7 @@ void RemoveSomeInputSymbols(const std::vector &to_remove, MutableFst *fst) { KALDI_ASSERT_IS_INTEGER_TYPE(I); RemoveSomeInputSymbolsMapper mapper(to_remove); - Map(fst, mapper); + ArcMap(fst, mapper); } template diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h index 5789dbe7cc3..db14ddd3576 100644 --- a/src/fstext/fstext-utils.h +++ b/src/fstext/fstext-utils.h @@ -113,7 +113,7 @@ void PushInLog(VectorFst *fst, uint32 ptype, float delta = kDelta) { template void MinimizeEncoded(VectorFst *fst, float delta = kDelta) { - Map(fst, QuantizeMapper(delta)); + ArcMap(fst, QuantizeMapper(delta)); EncodeMapper encoder(kEncodeLabels | kEncodeWeights, ENCODE); Encode(fst, &encoder); internal::AcceptorMinimize(fst); diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index c97a538dd1d..5d52ed3aa5a 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -268,7 +268,7 @@ void ConvertFstToLattice( MutableFst > > *ofst) { int32 num_states_cache = 50000; fst::CacheOptions cache_opts(true, num_states_cache); - fst::MapFstOptions mapfst_opts(cache_opts); + fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; MapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); From 5ccce55e04a59f0e5d0e400ef31f008e5940fa07 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:18 +0300 Subject: [PATCH 66/76] Support openfst-1.8.1 --- src/fstext/kaldi-fst-io-inl.h | 2 +- src/fstext/lattice-weight.h | 16 ++++++++-------- src/lat/kaldi-lattice.cc | 2 +- src/lat/lattice-functions-transition-model.cc | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index f7bb3a7c2b5..01047919c22 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -99,7 +99,7 @@ void ReadFstKaldi(std::istream &is, bool binary, fst->DeleteStates(); string line; size_t nline = 0; - string separator = FLAGS_fst_field_separator + "\r\n"; + string separator = FST_FLAGS_fst_field_separator + "\r\n"; while (std::getline(is, line)) { nline++; vector col; diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 6e7737a195d..f03ed702588 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -396,8 +396,8 @@ inline bool ApproxEqual(const LatticeWeightTpl &w1, template inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl &w) { LatticeWeightTpl::WriteFloatType(strm, w.Value1()); - CHECK(FLAGS_fst_weight_separator.size() == 1); - strm << FLAGS_fst_weight_separator[0]; // comma by default; + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); + strm << FST_FLAGS_fst_weight_separator[0]; // comma by default; // may or may not be settable from Kaldi programs. 
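The FLAGS_* to FST_FLAGS_* renames in this patch track OpenFst 1.8.1, which moved its command-line flags behind prefixed macros. If a single tree had to build against both old and new OpenFst, the rename could be hidden behind a shim keyed on a numeric version macro like the one configure computes (a hypothetical sketch; the patch itself simply adopts the new names):

#include <fst/weight.h>  // declares the separator flag (location may vary by version)

#if OPENFST_VER >= 10801  // assumed numeric version macro, e.g. 1.8.1 -> 10801
#define KALDI_FST_WEIGHT_SEPARATOR FST_FLAGS_fst_weight_separator
#else
#define KALDI_FST_WEIGHT_SEPARATOR FLAGS_fst_weight_separator
#endif

// Call sites would then use KALDI_FST_WEIGHT_SEPARATOR[0] unconditionally.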
LatticeWeightTpl::WriteFloatType(strm, w.Value2()); return strm; @@ -405,9 +405,9 @@ inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl inline std::istream &operator >>(std::istream &strm, LatticeWeightTpl &w1) { - CHECK(FLAGS_fst_weight_separator.size() == 1); + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); // separator defaults to ',' - return w1.ReadNoParen(strm, FLAGS_fst_weight_separator[0]); + return w1.ReadNoParen(strm, FST_FLAGS_fst_weight_separator[0]); } @@ -726,8 +726,8 @@ inline CompactLatticeWeightTpl Divide(const CompactLatticeW template inline std::ostream &operator <<(std::ostream &strm, const CompactLatticeWeightTpl &w) { strm << w.Weight(); - CHECK(FLAGS_fst_weight_separator.size() == 1); - strm << FLAGS_fst_weight_separator[0]; // comma by default. + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); + strm << FST_FLAGS_fst_weight_separator[0]; // comma by default. for(size_t i = 0; i < w.String().size(); i++) { strm << w.String()[i]; if (i+1 < w.String().size()) @@ -743,8 +743,8 @@ inline std::istream &operator >>(std::istream &strm, CompactLatticeWeightTpl col; diff --git a/src/lat/lattice-functions-transition-model.cc b/src/lat/lattice-functions-transition-model.cc index 6172610dca0..a8cd7b7e2dd 100644 --- a/src/lat/lattice-functions-transition-model.cc +++ b/src/lat/lattice-functions-transition-model.cc @@ -248,13 +248,13 @@ bool TestWordAlignedLattice(const WordAlignLatticeLexiconInfo &lexicon_info, int32 num_paths = 5, seed = Rand(), max_path_length = -1; BaseFloat delta = 0.2; // some lattices have large costs -> use large delta. - FLAGS_v = GetVerboseLevel(); // set the OpenFst verbose level to the Kaldi + FST_FLAGS_v = GetVerboseLevel(); // set the OpenFst verbose level to the Kaldi // verbose level. 
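TestWordAlignedLattice above raises the OpenFst verbosity and then hard-resets it to 0 on the way out; a small RAII guard (hypothetical, not part of the patch) would restore the previous level on every exit path, including early returns:

#include <fst/log.h>  // declares the OpenFst verbosity flag (FST_FLAGS_v in >= 1.8.1)

class ScopedFstVerbosity {
 public:
  explicit ScopedFstVerbosity(int level) : saved_(FST_FLAGS_v) {
    FST_FLAGS_v = level;  // raise verbosity for the enclosing scope
  }
  ~ScopedFstVerbosity() { FST_FLAGS_v = saved_; }  // restore on scope exit
 private:
  int saved_;
};

// usage:
//   { ScopedFstVerbosity guard(GetVerboseLevel());
//     RandEquivalent(clat, aligned_clat, ...); }  // verbosity reverts here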
if (!RandEquivalent(clat, aligned_clat, num_paths, delta, seed, max_path_length)) { KALDI_WARN << "Equivalence test failed during lattice alignment."; return false; } - FLAGS_v = 0; + FST_FLAGS_v = 0; return (num_err == 0); } From 9a213bed4d9a89e7134b3bedf7fea16538b90382 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:25 +0300 Subject: [PATCH 67/76] Support openfst-1.8.2 --- src/base/kaldi-types.h | 43 +++++++-------------------- src/fstext/lattice-utils-inl.h | 2 +- src/kws/kws-functions.cc | 4 +-- src/kws/kws-functions2.cc | 2 +- src/lat/arctic-weight.h | 2 +- src/lat/determinize-lattice-pruned.cc | 6 ++-- src/lat/minimize-lattice.cc | 2 +- src/lat/push-lattice.cc | 4 +-- src/lat/sausages.cc | 2 +- src/nnet3/nnet-batch-compute.cc | 2 +- src/online/online-tcp-source.cc | 2 +- src/rnnlm/rnnlm-test-utils.cc | 2 +- src/tree/tree-renderer.cc | 4 +-- 13 files changed, 28 insertions(+), 49 deletions(-) diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 7ebf4f85386..68d5578a5fb 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,37 +39,16 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef float float32; +typedef double double64; #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index 5d52ed3aa5a..03ac9947c5c 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -270,7 +270,7 @@ void ConvertFstToLattice( fst::CacheOptions cache_opts(true, num_states_cache); fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; - MapFst >, + ArcMapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); *ofst = map_fst; } diff --git a/src/kws/kws-functions.cc b/src/kws/kws-functions.cc index 3e27226f13c..e6819562f82 100644 --- a/src/kws/kws-functions.cc +++ b/src/kws/kws-functions.cc @@ -175,7 +175,7 @@ bool CreateFactorTransducer(const CompactLattice &clat, // Now we map the CompactLattice to VectorFst. We drop the // alignment information and only keep the negated log-probs - Map(clat, factor_transducer, CompactLatticeToKwsProductFstMapper()); + ArcMap(clat, factor_transducer, CompactLatticeToKwsProductFstMapper()); // Now do the weight pushing manually on the CompactLattice format. 
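The kaldi-types.h rewrite above is the heart of the 1.8.2 port: OpenFst dropped fst/types.h, so Kaldi's fixed-width aliases now come straight from the standard headers. The property the typedefs are trusted for can be pinned down at compile time:

#include <cstdint>

typedef int32_t int32;   // as in the patched kaldi-types.h
typedef uint64_t uint64;

static_assert(sizeof(int32) == 4, "int32 must be exactly 32 bits");
static_assert(sizeof(uint64) == 8, "uint64 must be exactly 64 bits");

int main() { return 0; }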
Note that // the alphas and betas in Kaldi are stored as the log-probs, not the negated @@ -366,7 +366,7 @@ void MaybeDoSanityCheck(const KwsProductFst &product_transducer) { if (GetVerboseLevel() < 2) return; KwsLexicographicFst index_transducer; - Map(product_transducer, + ArcMap(product_transducer, &index_transducer, KwsProductFstToKwsLexicographicFstMapper()); diff --git a/src/kws/kws-functions2.cc b/src/kws/kws-functions2.cc index 71f5583af19..9e610d2054e 100644 --- a/src/kws/kws-functions2.cc +++ b/src/kws/kws-functions2.cc @@ -92,7 +92,7 @@ void DoFactorMerging(KwsProductFst *factor_transducer, Decode(&dest_transducer, encoder); - Map(dest_transducer, index_transducer, KwsProductFstToKwsLexicographicFstMapper()); + ArcMap(dest_transducer, index_transducer, KwsProductFstToKwsLexicographicFstMapper()); } void DoFactorDisambiguation(KwsLexicographicFst *index_transducer) { diff --git a/src/lat/arctic-weight.h b/src/lat/arctic-weight.h index 5c0c6d3c416..39775ac8950 100644 --- a/src/lat/arctic-weight.h +++ b/src/lat/arctic-weight.h @@ -50,7 +50,7 @@ class ArcticWeightTpl : public FloatWeightTpl { static const std::string &Type() { static const std::string type = std::string("arctic") + - FloatWeightTpl::GetPrecisionString(); + std::string(FloatWeightTpl::GetPrecisionString()); return type; } diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index dbdd9af4645..ff3d65d57f3 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -1499,7 +1499,7 @@ bool DeterminizeLatticePhonePrunedWrapper( } ILabelCompare ilabel_comp; ArcSort(ifst, ilabel_comp); - ans = DeterminizeLatticePhonePruned( + ans = DeterminizeLatticePhonePruned( trans_model, ifst, beam, ofst, opts); Connect(ofst); return ans; @@ -1523,7 +1523,7 @@ bool DeterminizeLatticePruned( DeterminizeLatticePrunedOptions opts); template -bool DeterminizeLatticePhonePruned( +bool DeterminizeLatticePhonePruned( const kaldi::TransitionInformation &trans_model, const ExpandedFst &ifst, double prune, @@ -1531,7 +1531,7 @@ bool DeterminizeLatticePhonePruned( DeterminizeLatticePhonePrunedOptions opts); template -bool DeterminizeLatticePhonePruned( +bool DeterminizeLatticePhonePruned( const kaldi::TransitionInformation &trans_model, MutableFst *ifst, double prune, diff --git a/src/lat/minimize-lattice.cc b/src/lat/minimize-lattice.cc index ada90efadce..416f1e62e93 100644 --- a/src/lat/minimize-lattice.cc +++ b/src/lat/minimize-lattice.cc @@ -279,7 +279,7 @@ bool MinimizeCompactLattice( // Instantiate for CompactLattice type. template -bool MinimizeCompactLattice( +bool MinimizeCompactLattice( MutableFst *clat, float delta); diff --git a/src/lat/push-lattice.cc b/src/lat/push-lattice.cc index f4eb322d002..38a990d74d3 100644 --- a/src/lat/push-lattice.cc +++ b/src/lat/push-lattice.cc @@ -280,11 +280,11 @@ bool PushCompactLatticeWeights( // Instantiate for CompactLattice. template -bool PushCompactLatticeStrings( +bool PushCompactLatticeStrings( MutableFst *clat); template -bool PushCompactLatticeWeights( +bool PushCompactLatticeWeights( MutableFst *clat); } // namespace fst diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index b851bc3604c..03b384f93f1 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -325,7 +325,7 @@ void MinimumBayesRisk::PrepareLatticeAndInitStats(CompactLattice *clat) { // paper (i.e. just one final state). // Topologically sort the lattice, if not already sorted. 
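The minimize-lattice.cc and push-lattice.cc hunks above adjust explicit template instantiations (the angle-bracketed argument lists themselves are not visible in this extraction). As background, C++ allows an explicit instantiation to omit template arguments that are deducible from the declaration, shown here with a hypothetical function:

template <class Weight>
bool PushWeights(Weight *w) { return w != nullptr; }

struct TropicalLike { float value; };

// fully spelled out: template bool PushWeights<TropicalLike>(TropicalLike *);
template bool PushWeights(TropicalLike *);  // argument deduced instead

int main() {
  TropicalLike t = {0.0f};
  return PushWeights(&t) ? 0 : 1;
}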
- kaldi::uint64 props = clat->Properties(fst::kFstProperties, false); + uint64 props = clat->Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { if (fst::TopSort(clat) == false) KALDI_ERR << "Cycles detected in lattice."; diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 0e07834ed3d..fd84c4e56fe 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -1503,7 +1503,7 @@ NnetBatchDecoder::~NnetBatchDecoder() { } // Print diagnostics. - kaldi::int64 input_frame_count = + int64 input_frame_count = frame_count_ * computer_->GetOptions().frame_subsampling_factor; int32 num_threads = static_cast(decode_threads_.size()); diff --git a/src/online/online-tcp-source.cc b/src/online/online-tcp-source.cc index 6d63493b4bd..8421073d559 100644 --- a/src/online/online-tcp-source.cc +++ b/src/online/online-tcp-source.cc @@ -24,7 +24,7 @@ namespace kaldi { -typedef kaldi::int32 int32; +typedef int32 int32; OnlineTcpVectorSource::OnlineTcpVectorSource(int32 socket) : socket_desc(socket), diff --git a/src/rnnlm/rnnlm-test-utils.cc b/src/rnnlm/rnnlm-test-utils.cc index 32e8b5a4236..f415f257a06 100644 --- a/src/rnnlm/rnnlm-test-utils.cc +++ b/src/rnnlm/rnnlm-test-utils.cc @@ -78,7 +78,7 @@ void ConvertToInteger( for (int i = 0; i < string_sentences.size(); i++) { (*int_sentences)[i].resize(string_sentences[i].size()); for (int j = 0; j < string_sentences[i].size(); j++) { - kaldi::int64 key = symbol_table.Find(string_sentences[i][j]); + int64 key = symbol_table.Find(string_sentences[i][j]); KALDI_ASSERT(key != -1); // fst::kNoSymbol (*int_sentences)[i][j] = static_cast(key); } diff --git a/src/tree/tree-renderer.cc b/src/tree/tree-renderer.cc index bbaa5cda162..8e3b463fe7a 100644 --- a/src/tree/tree-renderer.cc +++ b/src/tree/tree-renderer.cc @@ -67,7 +67,7 @@ TreeRenderer::MakeEdgeLabel(const EventKeyType &key, oss << ", "; if (key != kPdfClass) { std::string phone = - phone_syms_.Find(static_cast(*child)); + phone_syms_.Find(static_cast(*child)); if (phone.empty()) KALDI_ERR << "No phone found for Phone ID " << *child; oss << phone; @@ -137,7 +137,7 @@ void TreeRenderer::RenderTable(const EventType *query, int32 id) { ExpectToken(is_, binary_, "NULL"); // consume the invalid/NULL entry continue; } - std::string phone = phone_syms_.Find(static_cast(t)); + std::string phone = phone_syms_.Find(static_cast(t)); if (phone.empty()) KALDI_ERR << "Phone ID found in a TableEventMap, but not in the " << "phone symbol table! ID: " << t; From 122a3f239ed2f24271eb61b9aa3060fa06b820ac Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 25 Jul 2024 09:57:49 +0200 Subject: [PATCH 68/76] make nonconst catches const (#4926) --- src/hmm/posterior.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 860a979a0ce..bce0c84ad79 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -146,7 +146,7 @@ bool PosteriorHolder::Read(std::istream &is) { try { ReadPosterior(is, is_binary, &t_); return true; - } catch (std::exception &e) { + } catch (const std::exception &e) { KALDI_WARN << "Exception caught reading table of posteriors. " << e.what(); t_.clear(); return false; @@ -207,7 +207,7 @@ bool GaussPostHolder::Read(std::istream &is) { } } return true; - } catch (std::exception &e) { + } catch (const std::exception &e) { KALDI_WARN << "Exception caught reading table of posteriors. 
" << e.what(); t_.clear(); return false; From 57efb6b5e6baf55538864bcd9f48c822b8064f09 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 25 Jul 2024 11:19:09 +0200 Subject: [PATCH 69/76] add support for later openfst versions --- src/base/kaldi-error-test.cc | 2 +- src/base/kaldi-types.h | 14 +++++++ src/bin/phones-to-prons.cc | 3 +- src/configure | 10 ++++- src/fstext/context-fst-test.cc | 6 ++- src/fstext/determinize-lattice-test.cc | 14 ++++--- src/fstext/determinize-star-test.cc | 47 +++++++++++----------- src/fstext/factor-test.cc | 5 ++- src/fstext/fstext-lib.h | 3 ++ src/fstext/fstext-utils-inl.h | 11 +++++ src/fstext/fstext-utils-test.cc | 8 ++-- src/fstext/kaldi-fst-io-inl.h | 5 ++- src/fstext/kaldi-fst-io.h | 1 + src/fstext/lattice-utils-test.cc | 22 +++++----- src/fstext/lattice-weight.h | 1 + src/fstext/openfst_compat.h | 42 +++++++++++++++++++ src/fstext/pre-determinize-inl.h | 5 +++ src/fstext/pre-determinize-test.cc | 19 +++++---- src/fstext/prune-special-test.cc | 8 ++-- src/fstext/push-special-test.cc | 6 ++- src/fstext/remove-eps-local-test.cc | 9 +++-- src/fstext/table-matcher-test.cc | 18 +++++---- src/fstext/table-matcher.h | 2 +- src/fstext/trivial-factor-weight-test.cc | 17 ++++---- src/kwsbin/kws-search.cc | 2 + src/lat/determinize-lattice-pruned-test.cc | 14 ++++--- src/lat/kaldi-lattice.cc | 9 ++++- src/lat/push-lattice-test.cc | 6 ++- src/latbin/lattice-oracle.cc | 2 + src/makefiles/android_openblas.mk | 3 +- src/makefiles/cygwin.mk | 3 +- src/makefiles/darwin.mk | 3 +- src/makefiles/darwin_arm64.mk | 3 +- src/makefiles/darwin_clapack.mk | 3 +- src/makefiles/linux_atlas.mk | 3 +- src/makefiles/linux_atlas_arm.mk | 3 +- src/makefiles/linux_atlas_ppc64le.mk | 3 +- src/makefiles/linux_clapack.mk | 3 +- src/makefiles/linux_clapack_arm.mk | 3 +- src/makefiles/linux_openblas.mk | 3 +- src/makefiles/linux_openblas_aarch64.mk | 3 +- src/makefiles/linux_openblas_arm.mk | 3 +- src/makefiles/linux_openblas_ppc64le.mk | 3 +- src/makefiles/linux_x86_64_mkl.mk | 3 +- tools/Makefile | 2 +- 45 files changed, 246 insertions(+), 112 deletions(-) create mode 100644 src/fstext/openfst_compat.h diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 31440edf3f9..68ef224b5f5 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -76,7 +76,7 @@ int main() { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. 
exit(1); - } catch (kaldi::KaldiFatalError &e) { + } catch (const kaldi::KaldiFatalError &e) { std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 68d5578a5fb..6d96ecf2b75 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,6 +39,7 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include +#if OPENFST_VER >= 10800 typedef int8_t int8; typedef int16_t int16; typedef int32_t int32; @@ -50,5 +51,18 @@ typedef uint32_t uint32; typedef uint64_t uint64; typedef float float32; typedef double double64; +#else +#include +#endif +namespace kaldi { + using ::int16; + using ::int32; + using ::int64; + using ::uint16; + using ::uint32; + using ::uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 0d7ab12c232..535c18365ed 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -172,7 +172,8 @@ int main(int argc, char *argv[]) { if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cerr, "standard error"); + printer_print(std::cerr, fstprinter, "standard error"); + //fstprinter.Print(&std::cerr, "standard error"); KALDI_LOG << "phone sequence is: "; for (size_t i = 0; i < phones.size(); i++) std::cerr << phones[i] << ' '; diff --git a/src/configure b/src/configure index c1e44512af9..3743c31f76b 100755 --- a/src/configure +++ b/src/configure @@ -39,7 +39,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=14 +CONFIGURE_VERSION=15 # We support bash version 3.2 (Macs still ship with this version as of 2019) # and above. @@ -1024,6 +1024,14 @@ OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d" if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.6.0.)" fi + +if [ $OPENFST_VER_NUM -lt 10800 ]; then + echo "CXXLANGVERSION = c++14" +else + echo "CXXLANGVERSION = c++17" +fi >> kaldi.mk + +echo "OPENFSTVER = $OPENFST_VER_NUM" >> kaldi.mk echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" diff --git a/src/fstext/context-fst-test.cc b/src/fstext/context-fst-test.cc index 65da1bb0797..2589c5c344e 100644 --- a/src/fstext/context-fst-test.cc +++ b/src/fstext/context-fst-test.cc @@ -23,6 +23,8 @@ #include "util/kaldi-io.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -196,7 +198,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Sequence FST is:\n"; { // Try to print the fst. FstPrinter fstprinter(*f, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -224,7 +226,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Composed FST is:\n"; { // Try to print the fst. 
FstPrinter fstprinter(fst_composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index 886aa4cc1b9..ae902021c7d 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -22,6 +22,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; using std::cout; @@ -94,7 +96,7 @@ template void TestDeterminizeLattice() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst det_fst; try { @@ -106,7 +108,7 @@ template void TestDeterminizeLattice() { std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); // OK, now determinize it a different way and check equivalence. @@ -117,7 +119,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact FST is:\n"; { FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (kaldi::Rand() % 2 == 1) ConvertLattice(det_fst, &compact_det_fst, false); @@ -128,7 +130,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); @@ -149,14 +151,14 @@ template void TestDeterminizeLattice2() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; } diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index 814e6a38d9b..c3fabb8a21e 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -24,6 +24,7 @@ #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -38,7 +39,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST before determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; try { @@ -46,7 +47,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST after determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, 
true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { @@ -101,7 +102,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -109,7 +110,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -122,7 +123,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -138,7 +139,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -157,14 +158,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -174,7 +175,7 @@ template void TestDeterminize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -242,7 +243,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
Connect(fst); @@ -250,7 +251,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -267,7 +268,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, fst_pushed, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -320,7 +321,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -328,7 +329,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -341,7 +342,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -357,7 +358,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -370,7 +371,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -382,7 +383,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -392,7 +393,7 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -401,7 +402,7 @@ template void TestMinimize() { { std::cout <<" printing after minimization [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); @@ -410,7 +411,7 @@ template void TestMinimize() { { std::cout <<" printing factor-weight FST\n"; FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } Map(fwfst, &ofst_star, 
FromGallicMapper()); @@ -418,7 +419,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -431,7 +432,7 @@ template void TestMinimize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 687d0ad59b3..d58dbfa539c 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -79,7 +80,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(&fst); @@ -87,7 +88,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (fst.Start() == kNoStateId) return; // "Connect" made it empty. diff --git a/src/fstext/fstext-lib.h b/src/fstext/fstext-lib.h index bdb8ff730e5..03c8e5861dd 100644 --- a/src/fstext/fstext-lib.h +++ b/src/fstext/fstext-lib.h @@ -20,6 +20,9 @@ #ifndef KALDI_FSTEXT_FSTEXT_LIB_H_ #define KALDI_FSTEXT_FSTEXT_LIB_H_ #include "fst/fstlib.h" + +#include "fstext/openfst_compat.h" + #include "fstext/context-fst.h" #include "fstext/determinize-star.h" #include "fstext/factor.h" diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 44e641a3f20..fb3a637bc19 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -374,6 +374,7 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); +#if OPENFST_VER >= 10800 for (SymbolTable::iterator iter = symtab.begin(); iter != symtab.end(); ++iter) { @@ -382,6 +383,16 @@ void GetSymbols(const SymbolTable &symtab, KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. } } +#else + for (SymbolTableIterator iter(symtab); + !iter.Done(); + iter.Next()) { + if (include_eps || iter.Value() != 0) { + syms_out->push_back(iter.Value()); + KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. + } + } +#endif } template diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 4ce296f093a..460e49c7dec 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -23,6 +23,8 @@ #include "util/stl-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -140,7 +142,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc
index 4ce296f093a..460e49c7dec 100644
--- a/src/fstext/fstext-utils-test.cc
+++ b/src/fstext/fstext-utils-test.cc
@@ -23,6 +23,8 @@
 #include "util/stl-utils.h"
 #include "base/kaldi-math.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 using std::vector;
@@ -140,7 +142,7 @@ template<class Arc> void TestSafeDeterminizeWrapper() {  // also tests SafeDeterminizeMinimizeWrapper().
   std::cout <<" printing before trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
   // Trim resulting FST.
   Connect(fst);
@@ -148,7 +150,7 @@ template<class Arc> void TestSafeDeterminizeWrapper() {  // also tests SafeDeterminizeMinimizeWrapper().
   std::cout <<" printing after trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   VectorFst<Arc> *fst_copy_orig = new VectorFst<Arc>(*fst);
@@ -362,7 +364,7 @@ void TestEqualAlign() {
 
 template<class Arc> void Print(const Fst<Arc> &fst, std::string message) {
   std::cout << message << "\n";
   FstPrinter<Arc> fstprinter(fst, NULL, NULL, NULL, false, true, "\t");
-  fstprinter.Print(&std::cout, "standard output");
+  printer_print(std::cout, fstprinter, "standard output");
 }
 
diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h
index 01047919c22..3baa5b95c9c 100644
--- a/src/fstext/kaldi-fst-io-inl.h
+++ b/src/fstext/kaldi-fst-io-inl.h
@@ -24,6 +24,8 @@
 
 #include "util/text-utils.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
@@ -44,7 +46,8 @@ void WriteFstKaldi(std::ostream &os, bool binary,
     bool acceptor = false, write_one = false;
     FstPrinter<Arc> printer(t, t.InputSymbols(), t.OutputSymbols(),
                             NULL, acceptor, write_one, "\t");
-    printer.Print(os, "<unknown>");
+    //printer.Print(&os, "<unknown>");
+    printer_print(os, printer, "<unknown>");
     if (os.fail())
       KALDI_ERR << "Stream failure detected writing FST to stream";
     // Write another newline as a terminating character. The read routine will
diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h
index a45920936ec..3c34f4b4787 100644
--- a/src/fstext/kaldi-fst-io.h
+++ b/src/fstext/kaldi-fst-io.h
@@ -26,6 +26,7 @@
 #include <fst/fstlib.h>
 #include <fst/fst-decl.h>
 #include "base/kaldi-common.h"
+#include "fstext/openfst_compat.h"
 
 // Some functions for writing Fsts.
 // I/O for FSTs is a bit of a mess, and not very well integrated with Kaldi's
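With the text-mode branch of WriteFstKaldi() now routed through printer_print(), the same caller code compiles against both OpenFST generations. A minimal usage sketch (the FST contents are invented for illustration; the declaration used here is the one patched above):

    // Sketch only: writes a trivial one-state FST to stdout in Kaldi
    // text format.  DemoWriteTextFst is a hypothetical helper.
    #include <iostream>
    #include "fstext/kaldi-fst-io.h"

    void DemoWriteTextFst() {
      fst::StdVectorFst f;
      f.AddState();
      f.SetStart(0);
      f.SetFinal(0, fst::TropicalWeight::One());
      // binary == false selects the FstPrinter-based text path.
      fst::WriteFstKaldi(std::cout, false /*binary*/, f);
    }
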
diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc
index aa931d47d07..6f1d2747cc1 100644
--- a/src/fstext/lattice-utils-test.cc
+++ b/src/fstext/lattice-utils-test.cc
@@ -21,6 +21,8 @@
 #include "fstext/fst-test-utils.h"
 #include "base/kaldi-math.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
 template<class Weight, class Int> void TestConvert(bool invert) {
@@ -31,7 +33,7 @@ template<class Weight, class Int> void TestConvert(bool invert) {
     std::cout << "FST before converting to compact-arc is:\n";
     {
       FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<CompactArc> ofst;
     ConvertLattice(*fst, &ofst, invert);
@@ -39,14 +41,14 @@ template<class Weight, class Int> void TestConvert(bool invert) {
     std::cout << "FST after converting is:\n";
     {
       FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<Arc> origfst;
     ConvertLattice(ofst, &origfst, invert);
     std::cout << "FST after back conversion is:\n";
     {
       FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
@@ -67,7 +69,7 @@ template<class Weight, class Int> void TestShortestPath() {
     std::cout << "FST before converting to compact-arc is:\n";
     {
       FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<CompactArc> cfst;
     ConvertLattice(*fst, &cfst, false);  // invert == false
@@ -205,7 +207,7 @@ template<class Int> void TestConvertPair(bool invert) {
   /*std::cout << "FST before converting to compact-arc is:\n";
   {
     FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<CompactArc> ofst;
   ConvertLattice(*fst, &ofst, invert);
@@ -213,14 +215,14 @@ template<class Int> void TestConvertPair(bool invert) {
   /*std::cout << "FST after converting is:\n";
   {
     FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<Arc> origfst;
   ConvertLattice(ofst, &origfst, invert);
   /*std::cout << "FST after back conversion is:\n";
   {
     FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
@@ -260,7 +262,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST before converting to compact-arc is:\n";
   {
     FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<CompactArc> ofst;
   ConvertLattice(*fst, &ofst, invert);
@@ -268,7 +270,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST after converting and scaling is:\n";
   {
     FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<Arc> origfst;
   ConvertLattice(ofst, &origfst, invert);
@@ -276,7 +278,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST after back conversion and scaling is:\n";
   {
     FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   // If RandEquivalent doesn't work, it could be due to a nasty issue related to the use
   // of exact floating-point comparisons in the Plus function of LatticeWeight.
diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h
index f03ed702588..1396764000a 100644
--- a/src/fstext/lattice-weight.h
+++ b/src/fstext/lattice-weight.h
@@ -23,6 +23,7 @@
 #include "fst/fstlib.h"
 #include "base/kaldi-common.h"
+#include "fstext/openfst_compat.h"
 
 namespace fst {
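All of these call-site changes lean on a single new header, introduced below. Besides the printer_print() and Map() shims, it maps the FST_FLAGS_* gflags spellings used by OpenFST 1.8 back to the bare FLAGS_* names of older releases, so flag reads can use one spelling everywhere. A hedged sketch of that pattern (WeightSeparator is a hypothetical helper, not part of this patch):

    // Sketch only: reads an OpenFST flag through its 1.8-era name; on
    // older OpenFST the compat header #defines it back to FLAGS_*.
    #include <string>
    #include <fst/fstlib.h>
    #include "fstext/openfst_compat.h"

    static std::string WeightSeparator() {
      return FST_FLAGS_fst_weight_separator;
    }
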
diff --git a/src/fstext/openfst_compat.h b/src/fstext/openfst_compat.h
new file mode 100644
index 00000000000..251d3f893c5
--- /dev/null
+++ b/src/fstext/openfst_compat.h
@@ -0,0 +1,42 @@
+#ifndef KALDI_FSTEXT_OPENFST_COMPAT_H
+#define KALDI_FSTEXT_OPENFST_COMPAT_H
+
+
+#if OPENFST_VER < 10800
+#define FST_FLAGS_fst_weight_separator FLAGS_fst_weight_separator
+#define FST_FLAGS_fst_field_separator FLAGS_fst_field_separator
+#define FST_FLAGS_v FLAGS_v
+
+#endif
+
+namespace fst {
+#if OPENFST_VER >= 10800
+
+
+template <typename... Args>
+auto Map(Args&&... args) -> decltype(ArcMap(std::forward<Args>(args)...)) {
+  return ArcMap(std::forward<Args>(args)...);
+}
+
+using MapFstOptions=ArcMapFstOptions;
+
+template <class A, class B, class C>
+using MapFst = ArcMapFst<A, B, C>;
+
+template <typename Stream, typename Printer>
+void printer_print(Stream &os, Printer &printer, const std::string &s) {
+  printer.Print(os, s);
+}
+
+#else
+
+template <typename Stream, typename Printer>
+void printer_print(Stream &os, Printer &printer, const std::string &s) {
+  printer.Print(&os, s);
+}
+
+#endif
+
+}  // namespace fst
+
+#endif //KALDI_FSTEXT_OPENFST_COMPAT_H
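Taken together, the shims above keep call sites version-agnostic: Map() forwards to ArcMap() on OpenFST >= 1.8 (where Map() was removed), and printer_print() hides the by-pointer vs. by-reference change in FstPrinter::Print(). An illustrative round trip through both shims (PrintInvertedWeights is a hypothetical helper, not part of this patch):

    // Sketch only: inverts arc weights via the Map() shim, then prints
    // the result via the printer_print() shim.
    #include <iostream>
    #include <fst/fstlib.h>
    #include <fst/script/print-impl.h>
    #include "fstext/openfst_compat.h"

    void PrintInvertedWeights(const fst::StdVectorFst &ifst) {
      fst::StdVectorFst ofst;
      fst::Map(ifst, &ofst, fst::InvertWeightMapper<fst::StdArc>());
      fst::FstPrinter<fst::StdArc> printer(ofst, NULL, NULL, NULL,
                                           false, true, "\t");
      fst::printer_print(std::cout, printer, "standard output");
    }
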
diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h
index 7c1b544da4c..45e1a82279a 100644
--- a/src/fstext/pre-determinize-inl.h
+++ b/src/fstext/pre-determinize-inl.h
@@ -235,8 +235,13 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix,
   assert(symTable != NULL);
   const char *prefix_ptr = prefix.c_str();
   size_t prefix_len = strlen(prefix_ptr);  // allowed to be zero but not encouraged.
+#if OPENFST_VER >= 10800
   for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) {
     const std::string &sym = siter->Symbol();
+#else
+  for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) {
+    const std::string &sym = siter.Symbol();
+#endif
     if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) {  // has prefix.
       if (isdigit(sym[prefix_len])) {  // we don't allow prefix followed by a digit, as a symbol.
         // Has at least one digit.
diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc
index 7210e455413..60953e40b8d 100644
--- a/src/fstext/pre-determinize-test.cc
+++ b/src/fstext/pre-determinize-test.cc
@@ -22,8 +22,7 @@
 #include "fstext/fst-test-utils.h"
 #include "fstext/fstext-utils.h"
 
-// Just check that it compiles, for now.
-
+#include "fstext/openfst_compat.h"
 
 namespace fst {
 using std::vector;
@@ -73,7 +72,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing before trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
   // Trim resulting FST.
   Connect(fst);
@@ -81,7 +80,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   VectorFst<Arc> *fst_copy_orig = new VectorFst<Arc>(*fst);
@@ -95,7 +94,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after predeterminization\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -111,7 +110,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after epsilon removal\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -121,14 +120,14 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after determinization\n";
   {
     FstPrinter<Arc> fstprinter(ofst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   int64 num_removed = DeleteISymbols(&ofst, extra_syms);
   std::cout <<" printing after removing "<<num_removed<<" instances of extra symbols\n";
   {
     FstPrinter<Arc> fstprinter(ofst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   std::cout <<" Checking equivalent to original FST.\n";
@@ -180,7 +179,7 @@ template<class Arc> void TestAddSelfLoops() {
   std::cout <<" printing before adding self-loops\n";
   {
     FstPrinter<Arc> fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -199,7 +198,7 @@ template<class Arc> void TestAddSelfLoops() {
   std::cout <<" printing after adding self-loops\n";
   {
     FstPrinter<Arc> fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   delete fst;
diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc
index 5d8c40b6a75..f91001fca0d 100644
--- a/src/fstext/prune-special-test.cc
+++ b/src/fstext/prune-special-test.cc
@@ -22,6 +22,8 @@
 #include "fstext/rand-fst.h"
 #include "fstext/fstext-utils.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
 static void TestPruneSpecial() {
@@ -38,7 +40,7 @@ static void TestPruneSpecial() {
 
   {
     FstPrinter<StdArc> fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
@@ -47,7 +49,7 @@ static void TestPruneSpecial() {
   PruneSpecial(*ifst, &ofst1, beam);
   {
     FstPrinter<StdArc> fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
@@ -56,7 +58,7 @@ static void TestPruneSpecial() {
   Prune(*ifst, &ofst2, beam);
   {
     FstPrinter<StdArc> fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc
index 557b43d3062..9fe8ba63b59 100644
--- a/src/fstext/push-special-test.cc
+++ b/src/fstext/push-special-test.cc
@@ -23,6 +23,8 @@
 #include "fstext/fstext-utils.h"
#include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -38,7 +40,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy(*fst); @@ -56,7 +58,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 80cca875ff0..1548ac5c726 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -83,7 +84,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy1(fst); @@ -96,7 +97,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -141,7 +142,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst = \n"; FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst; @@ -156,7 +157,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { // make sure we preserved stochasticity in cases where doing so was diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 2d39fe957dd..1cc8bd02bef 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst{ @@ -64,13 +66,13 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Baseline-Composed FST\n"; { FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if ( !RandEquivalent(composed, composed_baseline, 3/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 20/*path length-- max?*/)) { @@ -79,7 +81,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, 
fstprinter, "standard output"); } @@ -88,7 +90,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -149,7 +151,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -158,7 +160,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -219,7 +221,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -228,7 +230,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 290a4f8bc2e..9e921920c48 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -22,7 +22,7 @@ #include #include - +#include "base/kaldi-types.h" namespace fst { diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index b4682443d29..556d194a60d 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -22,7 +22,8 @@ #include "fstext/determinize-star.h" #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" -// Just check that it compiles, for now. + +#include "fstext/openfst_compat.h" namespace fst { @@ -73,7 +74,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -81,7 +82,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } vector