From de3632d3a35c0a3bc942c403f073c30fa897386c Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 13:43:58 +0100 Subject: [PATCH 01/76] Insert build system changes. --- src/Makefile | 6 +++ src/chain/Makefile | 13 +++++- src/configure | 79 ++++++++++++++++++++++++++++++++-- src/cudamatrix/Makefile | 13 +++++- src/makefiles/default_rules.mk | 10 ++++- src/nnet3/Makefile | 7 ++- src/nnet3bin/Makefile | 6 +++ 7 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..bc4375e30f6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..c4411f4b997 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,18 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ed627eceedc..feb2fd276ad 100755 --- a/src/configure +++ b/src/configure @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,63 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ]; then + ROCMDIR=$base + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true">> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCM will not be used! + ROCM is only supported with 64-bit Linux builds." + exit 1; + fi + + #add cusolver flags for newer toolkits + if [ "$CUSOLVER" == "true" ]; then + echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -371,6 +431,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +663,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +713,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +794,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -1304,6 +1376,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." 
failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..31c7c5ef3e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,19 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +#%.hip : %.cu +# $(HIPIFY) $< 1> $@ 2> $@.stats +#%.o : %.hip +# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endifn include ../makefiles/default_rules.mk diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 3ae5ed5e2dd..c27b7b0a108 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,12 +145,17 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_ENABLED), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) +ifeq ($(CUDA), true) NVCC_DEP_COMMAND = $(CUDATKDIR)/bin/nvcc -M $(CUDA_FLAGS) $(CUDA_INCLUDE) $(CUDA_SRCS) endif +ifeq ($(ROCM), true) +HIPCC_DEP_COMMAND = $(HIPCC) -M $(ROCM_FLAGS) $(ROCM_INCLUDE) $(CUDA_SRCS) +endif +endif endif .PHONY: depend @@ -162,6 +167,9 @@ endif ifneq ($(NVCC_DEP_COMMAND),) -$(NVCC_DEP_COMMAND) >> .depend.mk endif +ifneq ($(HIPCC_DEP_COMMAND),) + -$(HIPCC_DEP_COMMAND) >> .depend.mk +endif # removing automatic making of "depend" as it's quite slow. #.depend.mk: depend diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 0bf1bebe096..b6c75ac7118 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -3,9 +3,14 @@ all: include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) - +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-descriptor-test nnet-parse-test nnet-component-test \ diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index 039fc258b13..2bd23273982 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -3,8 +3,14 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk +ifeq ($(CUDA), true) LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) +endif +ifeq ($(ROCM), true) +LDFLAGS += $(ROCM_LDFLAGS) +LDLIBS += $(ROCM_LDLIBS) +endif BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs \ From 64c27545ce49357fe900de377eb266e9fe11f46d Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 7 Sep 2022 10:03:38 -0500 Subject: [PATCH 02/76] Remove extra quote. 
--- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index feb2fd276ad..21e439eeb4b 100755 --- a/src/configure +++ b/src/configure @@ -275,7 +275,7 @@ function configure_rocm { echo "# ROCm configuration" >> kaldi.mk echo >> kaldi.mk echo IS_GPU_BUILD = true >> kaldi.mk - echo ROCM = true">> kaldi.mk + echo ROCM = true >> kaldi.mk echo "ROCMDIR = $ROCMDIR" >> kaldi.mk echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk From ee18146a6ce723de6c26a78890f6e83b484c0460 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 07:05:47 -0500 Subject: [PATCH 03/76] Add hipify header. --- src/configure | 3 +- src/cudamatrix/Makefile | 4 +- src/cudamatrix/cu-device.cc | 8 +- src/cudamatrix/cu-kernels.cu | 9 ++- src/hip/hipify.h | 22 +++++ src/hip/math_constants.h | 152 +++++++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 21 +++++ 7 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 src/hip/hipify.h create mode 100644 src/hip/math_constants.h create mode 100644 src/makefiles/hip_64bit.mk diff --git a/src/configure b/src/configure index 21e439eeb4b..fa0b77373a0 100755 --- a/src/configure +++ b/src/configure @@ -258,9 +258,10 @@ function check_for_slow_expf { function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then - for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do if [ -f $base/bin/hipcc ]; then ROCMDIR=$base + break fi done fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 31c7c5ef3e5..512028c6c13 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -39,7 +39,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endifn + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..5bcb0552924 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..c644cbc0784 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,10 +28,17 @@ #include #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "cudamatrix/cu-kernels-ansi.h" +#include +#include +#else #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions diff --git a/src/hip/hipify.h b/src/hip/hipify.h new file mode 100644 index 00000000000..41b7a02cb04 --- /dev/null +++ b/src/hip/hipify.h @@ -0,0 +1,22 @@ +#ifndef __HIPIFY_H__ +#define __HIPIFY_H__ + +inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +// +// HIP types +// +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaGetDevice hipGetDevice +#define cudaStream_t hipStream_t +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) + +// 
+// HIPCUB +// +#define cub hipcub + + +#endif //__HIPIFY_H__ diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h new file mode 100644 index 00000000000..7fb8fce8e71 --- /dev/null +++ b/src/hip/math_constants.h @@ -0,0 +1,152 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__MATH_CONSTANTS_H__) +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ +#define CUDART_INF_F __int_as_float(0x7f800000) +#define CUDART_NAN_F __int_as_float(0x7fffffff) +#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) +#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) +#define CUDART_ZERO_F 0.0f +#define CUDART_ONE_F 1.0f +#define CUDART_SQRT_HALF_F 0.707106781f +#define CUDART_SQRT_HALF_HI_F 0.707106781f +#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f +#define CUDART_SQRT_TWO_F 1.414213562f +#define CUDART_THIRD_F 0.333333333f +#define CUDART_PIO4_F 0.785398163f +#define CUDART_PIO2_F 1.570796327f +#define CUDART_3PIO4_F 2.356194490f +#define CUDART_2_OVER_PI_F 0.636619772f +#define CUDART_SQRT_2_OVER_PI_F 0.797884561f +#define CUDART_PI_F 3.141592654f +#define CUDART_L2E_F 1.442695041f +#define CUDART_L2T_F 3.321928094f +#define CUDART_LG2_F 0.301029996f +#define CUDART_LGE_F 0.434294482f +#define CUDART_LN2_F 0.693147181f +#define CUDART_LNT_F 2.302585093f +#define CUDART_LNPI_F 1.144729886f +#define CUDART_TWO_TO_M126_F 1.175494351e-38f +#define CUDART_TWO_TO_126_F 8.507059173e37f +#define CUDART_NORM_HUGE_F 3.402823466e38f +#define CUDART_TWO_TO_23_F 8388608.0f +#define CUDART_TWO_TO_24_F 16777216.0f +#define CUDART_TWO_TO_31_F 2147483648.0f +#define CUDART_TWO_TO_32_F 4294967296.0f +#define CUDART_REMQUO_BITS_F 3 +#define CUDART_REMQUO_MASK_F (~((~0)< Date: Thu, 8 Sep 2022 18:07:47 -0500 Subject: [PATCH 04/76] Add more entries to hipificatiion header to deal with the BLAS routines. --- src/cudamatrix/cu-allocator.h | 7 ++ src/cudamatrix/cu-array-inl.h | 5 ++ src/cudamatrix/cu-common.h | 9 +++ src/cudamatrix/cu-device.h | 14 +++- src/cudamatrix/cu-matrix.cc | 6 ++ src/cudamatrix/cublas-wrappers.h | 17 ++-- src/hip/hipify.h | 129 +++++++++++++++++++++++++++++++ src/makefiles/hip_64bit.mk | 2 +- 8 files changed, 181 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..a3baa2fb33d 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,17 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..36b829046ed 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,12 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..617f4363269 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,20 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +//TODO: tests with ROCTX #include +#include +#else #include #include #include #include #include +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..515fa4d7d25 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,26 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include +#include +#else #include #include #include #include #include - 
+#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..a522f13451a 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..dc5c0e0ced5 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,17 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, + C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,8 +66,8 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, + return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, + (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); @@ -219,6 +222,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +247,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +324,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +341,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 
&& !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 41b7a02cb04..697afc7a6d3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,14 +5,143 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types +// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // + #define cudaDevAttrWarpSize hipDeviceAttributeWarpSize #define cudaDeviceGetAttribute hipDeviceGetAttribute #define cudaGetDevice hipGetDevice +#define cudaGetErrorString hipGetErrorString #define cudaStream_t hipStream_t #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) +#define cublasStatus_t hipblasStatus_t +#define cudaError_t hipError_t +#define cusparseDestroy hipsparseDestroy +#define cudaGetLastError hipGetLastError +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cublasCreate hipblasCreate +#define cublasSetStream hipblasSetStream +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define curandCreateGenerator hiprandCreateGenerator +#define curandSetStream hiprandSetStream +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetDeviceProperties hipGetDeviceProperties +#define curandDestroyGenerator hiprandDestroyGenerator +#define cusparseDestroy hipsparseDestroy +#define cudaDeviceProp hipDeviceProp_t +#define cublasOperation_t hipblasOperation_t +#define cublasStatus_t hipblasStatus_t +#define cusparseStatus_t hipsparseStatus_t +#define curandStatus_t hiprandStatus_t +#define cublasHandle_t hipblasHandle_t +#define cusparseHandle_t hipsparseHandle_t +#define curandGenerator_t hiprandGenerator_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cublasComputeType_t hipblasDatatype_t +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasDscal_v2 hipblasDscal +#define cublasSscal_v2 hipblasSscal +#define cudaSetDevice hipSetDevice +#define cudaSuccess hipSuccess +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnSetStream hipsolverDnSetStream +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define cusparseCreate hipsparseCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusparseSetStream hipsparseSetStream +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
+#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceReset hipDeviceReset +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cublasDestroy hipblasDestroy +#define cuDeviceGetName hipDeviceGetName +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cudaMemsetAsync hipMemsetAsync +#define cublasGemmEx hipblasGemmEx +#define cublasDgemm_v2 hipblasDgemm +#define cublasSger_v2 hipblasSger +#define cublasDger_v2 hipblasDger +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasSdot_v2 hipblasSdot +#define cublasSasum_v2 hipblasSasum +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasScopy_v2 hipblasScopy +#define cublasDcopy_v2 hipblasDcopy +#define cublasSgemv_v2 hipblasSgemv +#define cublasDgemv_v2 hipblasDgemv +#define cublasSspmv_v2 hipblasSspmv +#define cublasDspmv_v2 hipblasDspmv +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasSspr_v2 hipblasSspr +#define cublasDspr_v2 hipblasDspr +#define cudaDataType hipDataType +#define cusparseAction_t hipsparseAction_t +#define cublasDdot_v2 hipblasDdot +#define cublasDasum_v2 hipblasDasum +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasStpmv_v2 hipblasStpmv +#define cusparseIndexBase_t hipsparseIndexBase_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseOperation_t hipsparseOperation_t +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define cusparseCreateDnMat hipsparseCreateDnMat +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMM hipsparseSpMM +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseScsr2csc hipsparseScsr2csc +#define CUDA_R_64F HIP_R_64F +#define CUDA_R_32F HIP_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_R_32F HIPBLAS_R_32F +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_OP_N HIPBLAS_OP_N +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define 
cudaMemset2DAsync hipMemset2DAsync // // HIPCUB // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 453d9d5fe62..b405d84a15b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,7 +5,7 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 \ +CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) From 07f2f36e398aa09a59a6655c212f8c1233f81216 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Thu, 8 Sep 2022 18:36:28 -0500 Subject: [PATCH 05/76] Cudmatrix hipification complete. --- src/cudamatrix/cu-allocator.cc | 7 +++++ src/cudamatrix/cu-array.cc | 5 +++ src/cudamatrix/cu-block-matrix.cc | 6 ++++ src/cudamatrix/cu-common.cc | 5 +++ src/cudamatrix/cu-compressed-matrix.cc | 6 ++++ src/cudamatrix/cu-packed-matrix.cc | 6 ++++ src/cudamatrix/cu-sp-matrix.cc | 6 ++++ src/cudamatrix/cu-sparse-matrix.cc | 6 ++++ src/cudamatrix/cu-tp-matrix.cc | 6 ++++ src/cudamatrix/cu-vector.cc | 6 ++++ src/hip/hipify.h | 42 ++++++++++++++++++++++++++ 11 files changed, 101 insertions(+) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..8e08d3ef2a1 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #include +#endif + #include #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2017ebce5c7 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..a2bd910eba0 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..585d980ed19 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,12 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#else #include +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..0a5537b4248 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..f0563a6123f 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include 
+#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..a328457ca11 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..c0ebddfc95e 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..6929911fb5e 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,15 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..fa5d94fb0bc 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,15 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 697afc7a6d3..10010ceb70f 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -139,9 +139,51 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseDestroyMatDescr hipsparseDestroyMatDescr #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_C HIPBLAS_OP_C #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemset2DAsync hipMemset2DAsync +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define CUSPARSE_STATUS_NOT_SUPPORTED 
HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cuMemGetInfo_v2 hipMemGetInfo + // // HIPCUB // From fde6f7f478ce18af0142885fd625a33ce2946671 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 06:54:00 -0500 Subject: [PATCH 06/76] Ignore Eclipse synchronized project files. --- .gitignore | 4 ++++ src/chain/Makefile | 2 +- src/chain/chain-kernels-ansi.h | 4 ++++ src/chain/chain-kernels.cu | 5 +++++ src/makefiles/hip_64bit.mk | 8 +++----- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 9f8c727d4d0..53a4079d9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,7 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* + +# Eclipse sync project +.ptp-sync +.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index c4411f4b997..678bb03ef33 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -38,7 +38,7 @@ ifeq ($(ROCM), true) #%.o : %.hip # $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ %.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..739b9005854 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). 
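The pattern above keeps the kernels in their .cu files: hipcc is invoked
with -x hip (see the chain Makefile), so the file is compiled as HIP C++,
__IS_HIP_COMPILE__ selects the hipify.h mapping header, and hip_64bit.mk
defines __CUDA_ARCH__=800 so that legacy guards like the #error above still
pass. A minimal sketch of a translation unit following this pattern
(hypothetical file, not part of the patch):

    #ifdef __IS_HIP_COMPILE__
    #include <hip/hip_runtime.h>  // HIP runtime, found via the ROCm include paths
    #include "hipify.h"           // maps cuda*/cublas* spellings onto hip*
    #endif

    // The kernel body is identical for both toolchains: blockIdx, blockDim
    // and threadIdx are native to CUDA and HIP alike.
    __global__ void scale_kernel(float *data, float alpha, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= alpha;
    }
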
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index b405d84a15b..6ca4ea7d1b6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -13,9 +13,7 @@ ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64/stubs -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -#CUDA_LDFLAGS += -L$(CUDATKDIR)/lib/stubs -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -ROCM_LDFLAGS += - +#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib #CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt -ROCM_LDLIBS += +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 21ca60dfeeee2496801869ee96667cfd73df4aa6 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 08:02:20 -0500 Subject: [PATCH 07/76] Hipify complete including NVTX. --- src/chain/chain-kernels.cu | 1 - src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 84 ++++++++++++++------------ src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 7 ++- 16 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 739b9005854..2a30128750c 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -22,7 +22,6 @@ #ifdef __IS_HIP_COMPILE__ #include -#include #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 8e08d3ef2a1..82d682588d8 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index a3baa2fb33d..0cc1f7e6a4b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index a2bd910eba0..04885296445 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 585d980ed19..6275bc9073a 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -25,8 +25,10 @@ #ifdef __IS_HIP_COMPILE__ #include #include +#define API_NAME_PREFIX "HIP" #else #include +#define API_NAME_PREFIX "CU" #endif #include "base/kaldi-common.h" @@ -36,6 +38,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 
0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -48,9 +53,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else nvtxRangePop(); +#endif } #endif @@ -92,16 +102,16 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; } return "CUBLAS_STATUS_UNKNOWN_ERROR"; } @@ -110,43 +120,43 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case 
CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; + case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: return 
API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 617f4363269..a0c879414d4 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -36,7 +36,7 @@ #include #include #include -//TODO: tests with ROCTX #include +#include #include #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 0a5537b4248..de4fe6f8da2 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5bcb0552924..41f8d6f83d5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 515fa4d7d25..9286b6fe14a 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index a522f13451a..675ed74aeb4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index f0563a6123f..5acfc7443c4 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a328457ca11..adfb3e0b517 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index c0ebddfc95e..45742571a41 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 6929911fb5e..51fb744a855 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index fa5d94fb0bc..62ff16cb7f9 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 6ca4ea7d1b6..0ff628d67f6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,11 +5,14 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) 
endif -CXXFLAGS += -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ + +ROCM_USEROCTX = -DUSE_NVTX + +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. -I../hip -isystem $(OPENFSTINC) -ROCM_FLAGS = -fPIC -DHAVE_CUDA=1 \ +ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread From 104023482690fbdc92d1cb190a85de8b697f86be Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:21:01 -0500 Subject: [PATCH 08/76] Format files for the hipification. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-array-inl.h | 2 +- src/cudamatrix/cu-array.cc | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.cc | 13 +- src/cudamatrix/cu-common.h | 2 +- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 2 +- src/cudamatrix/cu-kernels.cu | 2 +- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 347 ++++++++++++------------- src/makefiles/hip_64bit.mk | 5 +- 19 files changed, 198 insertions(+), 199 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 82d682588d8..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,7 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0cc1f7e6a4b..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 36b829046ed..1fd80502cf9 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,7 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 2017ebce5c7..333e8fbed1c 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #else #include #endif diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 04885296445..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 6275bc9073a..2e77062f20d 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #define API_NAME_PREFIX "HIP" #else 
#include @@ -112,8 +112,12 @@ const char* cublasGetStatusStringK(cublasStatus_t status) { case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { @@ -135,7 +139,7 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; #endif } - return "SPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { @@ -155,6 +159,9 @@ const char* curandGetStatusString(curandStatus_t status) { case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index a0c879414d4..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index de4fe6f8da2..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 41f8d6f83d5..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -27,7 +27,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 9286b6fe14a..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index c644cbc0784..9a99f19b58f 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,7 +30,7 @@ #include #ifdef __IS_HIP_COMPILE__ #include -#include +#include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include #else #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 675ed74aeb4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,7 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 5acfc7443c4..c9d686d0ce8 100644 ---
a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,7 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index adfb3e0b517..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 45742571a41..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 51fb744a855..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,7 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 62ff16cb7f9..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include -#include +#include "hipify.h" #else #include #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 10010ceb70f..89daad6bc28 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -5,187 +5,180 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // // HIP types -// TODO: Verify that HIPBLAS_R_32F and HIPBLAS_GEMM_DEFAULT can be sensible replacements for tensor ops. // - -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaGetDevice hipGetDevice -#define cudaGetErrorString hipGetErrorString -#define cudaStream_t hipStream_t -#define cudaStreamLegacy ((hipStream_t)1) -#define cudaStreamPerThread ((hipStream_t)2) -#define cublasStatus_t hipblasStatus_t -#define cudaError_t hipError_t -#define cusparseDestroy hipsparseDestroy -#define cudaGetLastError hipGetLastError - -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cublasCreate hipblasCreate -#define cublasSetStream hipblasSetStream -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define curandCreateGenerator hiprandCreateGenerator -#define curandSetStream hiprandSetStream -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaGetDeviceProperties hipGetDeviceProperties -#define curandDestroyGenerator hiprandDestroyGenerator -#define cusparseDestroy hipsparseDestroy -#define cudaDeviceProp hipDeviceProp_t -#define cublasOperation_t hipblasOperation_t -#define cublasStatus_t hipblasStatus_t -#define cusparseStatus_t hipsparseStatus_t -#define curandStatus_t hiprandStatus_t -#define cublasHandle_t hipblasHandle_t -#define cusparseHandle_t hipsparseHandle_t -#define curandGenerator_t hiprandGenerator_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cublasComputeType_t hipblasDatatype_t -#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStreamSynchronize hipStreamSynchronize -#define 
cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasDscal_v2 hipblasDscal -#define cublasSscal_v2 hipblasSscal -#define cudaSetDevice hipSetDevice -#define cudaSuccess hipSuccess -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnSetStream hipsolverDnSetStream -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define cusparseCreate hipsparseCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusparseSetStream hipsparseSetStream -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaDeviceReset hipDeviceReset -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cublasDestroy hipblasDestroy -#define cuDeviceGetName hipDeviceGetName -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cudaMemsetAsync hipMemsetAsync -#define cublasGemmEx hipblasGemmEx -#define cublasDgemm_v2 hipblasDgemm -#define cublasSger_v2 hipblasSger -#define cublasDger_v2 hipblasDger -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasSdot_v2 hipblasSdot -#define cublasSasum_v2 hipblasSasum -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasScopy_v2 hipblasScopy -#define cublasDcopy_v2 hipblasDcopy -#define cublasSgemv_v2 hipblasSgemv -#define cublasDgemv_v2 hipblasDgemv -#define cublasSspmv_v2 hipblasSspmv -#define cublasDspmv_v2 hipblasDspmv -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasSspr_v2 hipblasSspr -#define cublasDspr_v2 hipblasDspr -#define cudaDataType hipDataType -#define cusparseAction_t hipsparseAction_t -#define cublasDdot_v2 hipblasDdot -#define cublasDasum_v2 hipblasDasum -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasStpmv_v2 hipblasStpmv -#define cusparseIndexBase_t hipsparseIndexBase_t -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define cusparseOperation_t hipsparseOperation_t -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define 
CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define cusparseCreateDnMat hipsparseCreateDnMat -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMM hipsparseSpMM -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseScsr2csc hipsparseScsr2csc -#define CUDA_R_64F HIP_R_64F -#define CUDA_R_32F HIP_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_R_32F HIPBLAS_R_32F -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_C HIPBLAS_OP_C -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemset2DAsync hipMemset2DAsync -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define 
CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR #define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO -#define cudaMalloc hipMalloc -#define cudaMallocPitch hipMallocPitch -#define cuMemGetInfo_v2 hipMemGetInfo +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName 
+#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaSetDevice hipSetDevice +#define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble 
+#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed +#define curandSetStream hiprandSetStream +#define curandStatus_t hiprandStatus_t +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cusolverDnSetStream hipsolverDnSetStream +#define cusparseAction_t hipsparseAction_t +#define cusparseCreate hipsparseCreate +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseCreateDnMat hipsparseCreateDnMat +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDcsr2csc hipsparseDcsr2csc +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroy hipsparseDestroy +#define cusparseDestroyDnMat hipsparseDestroyDnMat +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseDnMatDescr_t hipsparseDnMatDescr_t +#define cusparseGetMatIndexBase hipsparseGetMatIndexBase +#define cusparseHandle_t hipsparseHandle_t +#define cusparseIndexBase_t hipsparseIndexBase_t +#define cusparseMatDescr_t hipsparseMatDescr_t +#define cusparseOperation_t hipsparseOperation_t +#define cusparseScsr2csc hipsparseScsr2csc +#define cusparseSetStream hipsparseSetStream +#define cusparseSpMM hipsparseSpMM +#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseStatus_t hipsparseStatus_t // -// HIPCUB +// HIPCUB namespace. // #define cub hipcub diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0ff628d67f6..0c558a770d6 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -5,8 +5,8 @@ ifndef ROCMDIR $(error ROCMDIR not defined.) endif - -ROCM_USEROCTX = -DUSE_NVTX +# Uncomment if willing to use ROCTX capabilities. +# ROCM_USEROCTX = -DUSE_NVTX CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) @@ -18,5 +18,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -#CUDA_LDLIBS += -lcuda -lcublas -lcusparse -lcusolver -lcudart -lcurand -lcufft -lnvToolsExt CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 From 801115d710904ca505e318e9cd9cc3ffa7fc0f87 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Fri, 9 Sep 2022 09:57:45 -0500 Subject: [PATCH 09/76] Add hipification entries dropped by mistake. --- src/hip/hipify.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 89daad6bc28..7a0300ae02b 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -7,9 +7,12 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute is a viable replacement for the tensor-core alternative. +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute is a viable replacement for the tensor-core alternative.
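+// For example (illustrative expansion only): a call written as
+//   cublasGemmEx(..., CUBLAS_COMPUTE_32F_FAST_TF32, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
+// compiles under these defines as
+//   hipblasGemmEx(..., HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT),
+// i.e. the TF32/FP16 fast paths fall back to plain FP32 GEMMs on ROCm.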
#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. #define CUBLAS_OP_C HIPBLAS_OP_C #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_N HIPBLAS_OP_N @@ -146,6 +149,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandGenerateUniformDouble hiprandGenerateUniformDouble #define curandGenerator_t hiprandGenerator_t #define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordering. #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t From 081de1ebcc44b846c4953bb3923818d6142b90cc Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 06:06:19 -0500 Subject: [PATCH 10/76] Change IS_GPU_ENABLED to IS_GPU_BUILD in depends build. --- src/makefiles/default_rules.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index c27b7b0a108..21a3b053639 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -145,7 +145,7 @@ ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif -ifeq ($(IS_GPU_ENABLED), true) +ifeq ($(IS_GPU_BUILD), true) CUDA_SRCS=$(wildcard *.cu) # Check if any CUDA .cu sources exist to run dependency commands on. ifneq ($(CUDA_SRCS),) From 00098bf097ca7e9e804562c937b20c6714adf2f8 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:11:35 -0500 Subject: [PATCH 11/76] Add build logic for ROCm < 5.2.0. --- src/configure | 28 +++++++++++++++++++++------- src/hip/hipify.h | 21 +++++++++++++++++++++ src/makefiles/hip_64bit.mk | 17 ++++++++++++----- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/configure b/src/configure index fa0b77373a0..ffb87abe106 100755 --- a/src/configure +++ b/src/configure @@ -259,7 +259,7 @@ function configure_rocm { # Check for ROCM in the system if [ ! -d "$ROCMDIR" ]; then for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do - if [ -f $base/bin/hipcc ]; then + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then ROCMDIR=$base break fi @@ -268,7 +268,7 @@ function configure_rocm { if [ -d "$ROCMDIR" ]; then if [ ! -f $ROCMDIR/bin/hipcc ]; then - failure "Cannnot find hipcc in ROCm directory $ROCMDIR" + failure "Cannot find hipcc and hipconfig in ROCm directory $ROCMDIR" fi fi echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" @@ -289,7 +289,20 @@ function configure_rocm { echo "HOST_ARCH = `uname -m`" >> kaldi.mk echo >> kaldi.mk - + + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Enable the HIP implementation for CXX compile commands. ROCm 5.2.0 onwards uses + # __HIP_PLATFORM_AMD__; older versions use __HIP_PLATFORM_HCC__. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk + else + echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + fi + # 64bit/32bit? Not Linux?
We do not support cross compilation with ROCm so, # use direct calls to uname -m here if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then @@ -300,10 +313,11 @@ WARNING: ROCM will not be used! ROCM is only supported with 64-bit Linux builds." exit 1; fi - - #add cusolver flags for newer toolkits - if [ "$CUSOLVER" == "true" ]; then - echo "ROCM_LDLIBS += -lcusolver" >> kaldi.mk + + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then + echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + else + echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk fi } diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 7a0300ae02b..bdefa9cc4dd 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -3,6 +3,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} + +#undef hipLaunchKernelGGLInternal +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#else +#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ + do { \ + kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ + } while (0) +#endif + // // HIP types // @@ -153,10 +167,17 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t +#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#else +#define cusolverDnCreate hipsolverCreate +#define cusolverDnDestroy hipsolverDestroy +#define cusolverDnHandle_t hipsolverHandle_t +#define cusolverDnSetStream hipsolverSetStream +#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 0c558a770d6..3976624032d 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -8,13 +8,20 @@ endif # Uncomment if willing to use ROCTX capabilities. # ROCM_USEROCTX = -DUSE_NVTX -CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 -D__IS_HIP_COMPILE__=1 -D__HIP_PLATFORM_AMD__=1 -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) +# Specific HIP/ROCm components should be included prior to the generic include to avoid +# deprecation warnings. +CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -DCUDA_VERSION=11000 \ + -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE= -I$(ROCMDIR)/include -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I.. 
-I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ - -D__IS_HIP_COMPILE__=1 -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -fgpu-default-stream=per-thread + -D__IS_HIP_COMPILE__=1 \ + -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ + -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib From 9b8dffb3a594293fbf4286233df610ae6041b284 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 12 Sep 2022 17:33:16 -0500 Subject: [PATCH 12/76] Complete ROCm 5.0.2 build with no per-thread streams yet. --- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +++--- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/makefiles/hip_64bit.mk | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..3b47ee525eb 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..09ba2c9aa13 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..309d68fccf7 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..99165cc592f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..dfcbf41d131 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..c073ab358ea 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h 
b/src/cudamatrix/cu-device.h index d7edf5a5a1c..1311668ec33 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..96c1ef14ed4 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..8a5865f71af 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..fabd06c9b16 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..3853ffa7e45 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..dd3a333c9a5 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..cc6332ba48c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 3976624032d..160f5fb5c0f 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -10,7 +10,7 @@ endif # Specific HIP/ROCm components should be included prior to the generic include to avoid # deprecation warnings. -CXXFLAGS += -Werror $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ +CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ From e84d8f072496c9427e804f8189854da9ff49c04b Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 07:44:43 -0500 Subject: [PATCH 13/76] Add cudadecoder support for ROCm 5.2.x. 
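(Context for the kernel-launch rules added in this patch: pre-5.2 ROCm lacks hipcc's -fgpu-default-stream=per-thread option, so when configure sets ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION the GPU Makefiles derive a %.hip intermediate whose kernel launches name the stream explicitly. A sketch of the rewrite, where my_kernel, grid, block and args are placeholder names:

    my_kernel<<<grid, block>>>(args);
    // becomes
    my_kernel<<<grid, block, 0, hipStreamPerThread>>>(args);

Launches that already pass an explicit stream as a fourth launch argument should be left untouched by the two sed substitutions.)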
--- src/chain/Makefile | 16 ++++-- src/configure | 9 +++- src/cudadecoder/Makefile | 22 +++++++- .../batched-static-nnet3-kernels.cu | 5 ++ .../batched-static-nnet3-kernels.h | 5 ++ ...hed-threaded-nnet3-cuda-online-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline.cc | 5 ++ .../batched-threaded-nnet3-cuda-pipeline2.cc | 5 ++ src/cudadecoder/cuda-decoder-kernels-utils.h | 4 +- src/cudadecoder/cuda-decoder-kernels.cu | 6 +++ src/cudadecoder/cuda-decoder.cc | 24 +++++---- src/cudadecoder/cuda-decoder.h | 5 ++ src/cudadecoder/cuda-fst.cc | 6 +++ src/cudadecoderbin/Makefile | 4 +- .../batched-wav-nnet3-cuda-online.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 6 +++ src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 7 +++ src/cudafeat/Makefile | 23 +++++++- ...eature-online-batched-cmvn-cuda-kernels.cu | 5 ++ ...ure-online-batched-ivector-cuda-kernels.cu | 5 ++ .../feature-online-batched-ivector-cuda.cc | 16 ++++++ ...re-online-batched-spectral-cuda-kernels.cu | 6 +++ .../feature-online-batched-spectral-cuda.h | 5 ++ src/cudafeat/feature-online-cmvn-cuda.cu | 8 +++ src/cudafeat/feature-spectral-cuda.cu | 6 +++ src/cudafeat/feature-spectral-cuda.h | 5 ++ src/cudafeat/feature-window-cuda.cu | 5 ++ .../online-batched-feature-pipeline-cuda.cc | 7 ++- .../online-batched-feature-pipeline-cuda.h | 4 ++ .../online-ivector-feature-cuda-kernels.cu | 6 +++ src/cudafeat/online-ivector-feature-cuda.cc | 14 ++++- src/cudamatrix/Makefile | 16 ++++-- src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 2 +- src/cudamatrix/cu-block-matrix.cc | 2 +- src/cudamatrix/cu-common.h | 6 +-- src/cudamatrix/cu-compressed-matrix.cc | 2 +- src/cudamatrix/cu-device.cc | 2 +- src/cudamatrix/cu-device.h | 8 +-- src/cudamatrix/cu-kernels.cu | 1 + src/cudamatrix/cu-matrix.cc | 2 +- src/cudamatrix/cu-packed-matrix.cc | 2 +- src/cudamatrix/cu-sp-matrix.cc | 2 +- src/cudamatrix/cu-sparse-matrix.cc | 2 +- src/cudamatrix/cu-tp-matrix.cc | 2 +- src/cudamatrix/cu-vector.cc | 2 +- src/hip/hipify.h | 54 ++++++++++++++----- src/makefiles/hip_64bit.mk | 18 +++++-- 48 files changed, 318 insertions(+), 62 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 678bb03ef33..5cc8d8901a1 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,13 +33,21 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index ffb87abe106..ca3df9563ab 100755 --- a/src/configure +++ b/src/configure @@ -316,8 +316,9 @@ WARNING: ROCM will not be used! 
if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk else - echo "ROCM_FLAGS += -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1" >> kaldi.mk + echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk fi } @@ -1055,7 +1056,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..062e9a47d41 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..0bcb1997576 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,12 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..c7012b686e0 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,12 @@ #include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc 
index 89e93e5d98c..d5cf7dae2d7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..f6a3455db01 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,12 @@ #include +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..6a14371911d 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
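+// "float.h" is included explicitly here, presumably for the FLT_MAX-style
+// limits the decoder kernels rely on, which the HIP headers do not appear to
+// provide transitively.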
+#ifdef __IS_HIP_COMPILE__ +#include "float.h" +#include +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..06dceae73a5 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,14 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +190,35 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void**)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void**)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void**)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void**)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..510904aa004 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,12 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..3af37eb7676 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,14 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- 
a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..56368853df2 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. #endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..05af50d7a3b 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "roctracer/roctx.h" +#include "hipify.h" +#else #include #include #include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..c14571f2ed9 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,16 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include "hipify.h" +#else #include #include #include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..c3a4489e18e 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,26 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +else +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..09b0caff255 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 
+15,12 @@ // See the License for the specific language governing permissions and // limitations under the License. // +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..0b4cfce812c 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,12 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..6d68c93f917 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..f847311d755 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..8d4648d04bb 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +39,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..c320c85a029 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,14 @@ #include "cudafeat/feature-spectral-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include "hipify.h" +#else #include #include +#endif #include "cudamatrix/cu-rand.h" diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,13 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..6ba45e682c1 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,12 @@ #include "cudafeat/feature-window-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..650b51ec3c7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,12 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif namespace kaldi { @@ -95,7 +100,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..378ea18e689 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..c3b15d72a5b 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,19 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else #include #endif +#endif + #include #include "base/io-funcs.h" @@ -288,13 +299,14 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. CuMatrix A(quadratic); + + #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); - // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 512028c6c13..5cd4adcffd8 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,12 +34,20 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -#%.hip : %.cu -# $(HIPIFY) $< 1> $@ 2> $@.stats -#%.o : %.hip -# $(HIPCC) -c $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) +.PRECIOUS: %.hip +%.hip : %.cu + LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + cat $< | \ + sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ + sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ + cat > $@ +%.o : %.hip + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index 3b47ee525eb..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 09ba2c9aa13..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 309d68fccf7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 99165cc592f..da7c57bde36 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,10 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include +#include #include -#include -#include +#include +#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc 
b/src/cudamatrix/cu-compressed-matrix.cc index dfcbf41d131..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c073ab358ea..705bfbeee59 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 1311668ec33..d7edf5a5a1c 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,11 @@ #include #ifdef __IS_HIP_COMPILE__ -#include +#include #include #include -#include -#include +#include +#include #include "hipify.h" #else #include @@ -44,7 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#include +#include #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9a99f19b58f..1d6e0664541 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -23,6 +23,7 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. + // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 96c1ef14ed4..c1d72ede87e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 8a5865f71af..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index fabd06c9b16..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 3853ffa7e45..a21e5163701 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index dd3a333c9a5..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cc6332ba48c..cf13d631a0d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#include +#include #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index bdefa9cc4dd..24b5f2f8eb3 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -1,29 +1,22 @@ #ifndef __HIPIFY_H__ #define __HIPIFY_H__ 
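+// hipify.h is a thin compatibility shim: plain #define mappings translate
+// CUDA API names, types and enumerators to their HIP equivalents, so the
+// unmodified CUDA sources can be compiled with hipcc. For example, with
+// this header included, a call such as cudaMalloc(&ptr, bytes) compiles
+// as hipMalloc(&ptr, bytes) via the mappings below.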
+#ifdef __HIPCC__ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} - - -#undef hipLaunchKernelGGLInternal -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamPerThread) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +// AMDGCN only support this rounding mode. +#define __fdiv_rd __fdiv_rn #else -#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<<(numBlocks), (numThreads), (memPerBlock), ( (streamId == 0) ? (hipStreamDefault) : (streamId) )>>>(__VA_ARGS__); \ - } while (0) +#define __align__(x) __attribute__((aligned (x))) #endif // // HIP types // #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. #define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER #define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. @@ -46,6 +39,8 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUDA_R_32F HIP_R_32F #define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS #define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT #define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED #define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH @@ -104,6 +99,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cublasGemmAlgo_t hipblasGemmAlgo_t #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasOperation_t hipblasOperation_t #define cublasSasum_v2 hipblasSasum @@ -133,15 +129,29 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse #define cudaErrorInvalidDevice hipErrorInvalidDevice #define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t #define cudaFree hipFree +#define cudaFreeHost hipFreeHost #define cudaGetDevice hipGetDevice #define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName #define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define 
cudaHostUnregister hipHostUnregister #define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice @@ -150,11 +160,20 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync #define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream #define curandCreateGenerator hiprandCreateGenerator #define curandDestroyGenerator hiprandDestroyGenerator #define curandGenerateNormal hiprandGenerateNormal @@ -178,6 +197,11 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream #endif +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -201,7 +225,9 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize #define cusparseSpMatDescr_t hipsparseSpMatDescr_t #define cusparseStatus_t hipsparseStatus_t - +#define nvtxRangePop roctxRangePop +#define nvtxRangePush roctxRangePush +#define nvtxRangePushA roctxRangePushA // // HIPCUB namespace. // diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index 160f5fb5c0f..e2f43ecd55c 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -14,9 +14,21 @@ CXXFLAGS += $(ROCM_USEROCTX) -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ -DCUDA_VERSION=11000 \ - -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -fPIC -pthread -isystem $(OPENFSTINC) + -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. -I../hip -fPIC -pthread -isystem $(OPENFSTINC) -ROCM_INCLUDE = -I$(ROCMDIR)/hiprand/include -I$(ROCMDIR)/rocrand/include -I$(ROCMDIR)/include -I../hip -isystem $(OPENFSTINC) +ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ + -I$(ROCMDIR)/hipfft/include \ + -I$(ROCMDIR)/hipblas/include \ + -I$(ROCMDIR)/hiprand/include \ + -I$(ROCMDIR)/rocrand/include \ + -I$(ROCMDIR)/include \ + -I.. 
-I../hip -isystem $(OPENFSTINC) ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ @@ -25,4 +37,4 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ #TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib -CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lamdhip64 +CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 From aed0ce594e72bc935ab1f2fade0f26aa5229a3b9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 13 Sep 2022 11:44:33 -0500 Subject: [PATCH 14/76] Complete support for ROCm 5.0.2. --- src/chain/Makefile | 2 +- src/cudadecoder/Makefile | 2 +- src/cudafeat/Makefile | 2 +- .../feature-online-batched-ivector-cuda.cc | 41 +++++++++++++++++-- .../feature-online-batched-spectral-cuda.h | 4 ++ src/cudafeat/feature-spectral-cuda.h | 4 ++ src/cudafeat/online-ivector-feature-cuda.cc | 17 ++++++++ src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-allocator.cc | 4 ++ src/cudamatrix/cu-allocator.h | 4 ++ src/cudamatrix/cu-block-matrix.cc | 4 ++ src/cudamatrix/cu-common.h | 7 +++- src/cudamatrix/cu-compressed-matrix.cc | 4 ++ src/cudamatrix/cu-device.cc | 5 ++- src/cudamatrix/cu-device.h | 11 ++++- src/cudamatrix/cu-matrix.cc | 4 ++ src/cudamatrix/cu-packed-matrix.cc | 4 ++ src/cudamatrix/cu-sp-matrix.cc | 4 ++ src/cudamatrix/cu-sparse-matrix.cc | 4 ++ src/cudamatrix/cu-tp-matrix.cc | 4 ++ src/cudamatrix/cu-vector.cc | 4 ++ src/hip/hipify.h | 16 +++++--- 22 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/chain/Makefile b/src/chain/Makefile index 5cc8d8901a1..5b177981ad8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -36,7 +36,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index 062e9a47d41..d4eda345564 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -44,7 +44,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c3a4489e18e..c0f54a854e8 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -47,7 +47,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 6d68c93f917..68c247b43e9 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -17,9 +17,6 @@ #ifdef __IS_HIP_COMPILE__ #include "hipify.h" -#define 
cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched // The BLAS enumerators are used instead of the SOLVER ones. #ifdef CUBLAS_FILL_MODE_LOWER #undef CUBLAS_FILL_MODE_LOWER @@ -385,6 +382,43 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; + +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work; + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, &L_work, num_lanes)); + // allocate temp buffer + float *workspace = static_cast( + CuDevice::Instantiate().Malloc(L_work * sizeof(float))); + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, + ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); + + int L_work2; + + // perform factorization in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + // solve for rhs in batched + CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, + quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, + num_lanes)); + + CuDevice::Instantiate().Free(workspace); + CuDevice::Instantiate().Free(workspace2); +#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -395,6 +429,7 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); +#endif #endif // cusolver solves in place. 
Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..202232c6b23 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..66f0dce395a 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,7 +20,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index c3b15d72a5b..56dbac93165 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -317,9 +317,26 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs +#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) + // query temp buffer size + int L_work2; + CUSOLVER_SAFE_CALL( + hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); + // allocate temp buffer + float *workspace2 = static_cast( + CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); + + CUSOLVER_SAFE_CALL(hipsolverSpotrs( + GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, + A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); + + CuDevice::Instantiate().Free(workspace2); +#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); +#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 5cd4adcffd8..3c1100753e5 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -37,7 +37,7 @@ ifeq ($(ROCM), true) ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) .PRECIOUS: %.hip %.hip : %.cu - LA='[^\(]+\([^\)]+\)|[^,]+' ; \ + LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ cat $< | \ sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..d81dca002ce 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,7 +25,11 @@ #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..f776bbb620e 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..7983cd250e7 100644 --- 
a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index da7c57bde36..c4bdf569d3c 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,10 +32,15 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include -#include #include #include "hipify.h" #else diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..442d2dbac67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 705bfbeee59..3dada172ba8 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,10 +21,13 @@ // limitations under the License. - #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index d7edf5a5a1c..67b9f1d9e9b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,16 @@ #include #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#include +#else #include +#include +#endif #include #include #include -#include #include "hipify.h" #else #include @@ -44,7 +49,11 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #else #include #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c1d72ede87e..9897917a33f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,7 +29,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..4de0fcba63d 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,7 +23,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..86a3cd9a726 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index a21e5163701..93d10099466 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 
5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..739bab3dd59 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,7 +21,11 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index cf13d631a0d..1deb1cb8733 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,7 +24,11 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include +#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 +#include +#else #include +#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 24b5f2f8eb3..b631ac08a23 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -191,17 +191,22 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t #define cusolverDnSetStream hipsolverDnSetStream +#define cusolverDnSpotrf hipsolverDnSpotrf +#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverDnSpotrs +#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #else #define cusolverDnCreate hipsolverCreate #define cusolverDnDestroy hipsolverDestroy #define cusolverDnHandle_t hipsolverHandle_t #define cusolverDnSetStream hipsolverSetStream +#define cusolverDnSpotrf hipsolverSpotrf +#define cusolverDnSpotrfBatched hipsolverSpotrfBatched +#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize +#define cusolverDnSpotrs hipsolverSpotrs +#define cusolverDnSpotrsBatched hipsolverSpotrsBatched #endif -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr @@ -235,3 +240,4 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #endif //__HIPIFY_H__ + From 99101e8de70f17e670266f578638fe14e7785dce Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Tue, 6 Dec 2022 19:31:39 +0300 Subject: [PATCH 15/76] Do not use ADL to invoke std::binary_search --- src/tree/build-tree.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/build-tree.cc b/src/tree/build-tree.cc index 534f3352def..9726b5343ee 100644 --- a/src/tree/build-tree.cc +++ b/src/tree/build-tree.cc @@ -675,7 +675,7 @@ void AutomaticallyObtainQuestions(BuildTreeStatsType &stats, for (int32 i = 0; static_cast(i) < summed_stats.size(); i++) { // A check. if (summed_stats[i] != NULL && - !binary_search(phones.begin(), phones.end(), i)) { + !std::binary_search(phones.begin(), phones.end(), i)) { KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this]."; } } @@ -795,7 +795,7 @@ void KMeansClusterPhones(BuildTreeStatsType &stats, for (int32 i = 0; static_cast(i) < summed_stats.size(); i++) { // just a check. 
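+      // Qualify the call as std::binary_search; an unqualified call can be
+      // resolved to an unintended overload through argument-dependent lookup.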
if (summed_stats[i] != NULL && - !binary_search(phones.begin(), phones.end(), i)) { + !std::binary_search(phones.begin(), phones.end(), i)) { KALDI_WARN << "Phone "<< i << " is present in stats but is not in phone list [make sure you intended this]."; } } From a023f3fe56c0c43f7be5a6086f0cc1067aeebeae Mon Sep 17 00:00:00 2001 From: daanzu Date: Sun, 11 Dec 2022 06:54:17 -0500 Subject: [PATCH 16/76] SRILM: allow bypassing download/extraction during automated installation --- tools/extras/install_srilm.sh | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh index 813109dbb80..fa4b7b7ed80 100755 --- a/tools/extras/install_srilm.sh +++ b/tools/extras/install_srilm.sh @@ -16,30 +16,41 @@ fi ! command -v gawk > /dev/null && \ echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; -if [ $# -ne 3 ]; then - echo "SRILM download requires some information about you" - echo - echo "Usage: $0 " - exit 1 -fi - -srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" -post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" - -if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 +if [ ! -f srilm.tgz ] && [ ! -f srilm.tar.gz ] && [ ! -d srilm ]; then + if [ $# -ne 3 ]; then + echo "SRILM download requires some information about you" + echo + echo "Usage: $0 " + exit 1 + fi + + srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" + post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" + + if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then + echo 'There was a problem downloading the file.' + echo 'Check your internet connection and try again.' + exit 1 + fi + + if [ ! -s srilm.tar.gz ]; then + echo 'The file is empty. There was a problem downloading the file.' + exit 1 + fi fi mkdir -p srilm cd srilm - if [ -f ../srilm.tgz ]; then - tar -xvzf ../srilm.tgz # Old SRILM format -elif [ -f ../srilm.tar.gz ]; then - tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz + tar -xvzf ../srilm.tgz || exit 1 # Old SRILM format +elif [ -f ../srilm.tar.gz ]; then + tar -xvzf ../srilm.tar.gz || exit 1 # Changed format type from tgz to tar.gz +fi + +if [ ! -f RELEASE ]; then + echo 'The file RELEASE does not exist. There was a problem extracting.' + exit 1 fi major=`gawk -F. '{ print $1 }' RELEASE` From be22248e3a166d9ec52c78dac945f471e7c3a8aa Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 13 Dec 2022 02:29:16 -0800 Subject: [PATCH 17/76] [src] Make word alignment optional (#4802) * Remove unused variable. * cudadecoder: Make word alignment optional. For CTC models using word pieces or graphemes, there is not enough positional information to use the word alignment. I tried marking every unit as "singleton" word_boundary.txt, but this explodes the state space very, very often. See: https://github.com/nvidia-riva/riva-asrlib-decoder/issues/3 With the "_" character in CTC models predicting word pieces, we at the very least know which word pieces begin a word and which ones are either in the middle of the word or the end of a word, but the algorithm would still need to be rewritten, especially since "blank" is not a silence phoneme (it can appear between). I did look into using the lexicon-based word alignment. 
I don't have a specific complaint about it, but I did get a weird error where it couldn't create a final state at all in the output lattice, which caused Connect() to output an empty lattice. This may be because I wasn't quite sure how to handle the blank token. I treat it as its own phoneme, because of limitations in TransitionInformation, but this doesn't really make any sense. Needless to say, while the CTM outputs of the cuda decoder will be correct from a WER point of view, their time stamps won't be; they probably never were correct in the first place for CTC models. --- src/cudadecoder/lattice-postprocessor.cc | 15 ++++++++------- src/fstext/pre-determinize-inl.h | 2 -- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cudadecoder/lattice-postprocessor.cc b/src/cudadecoder/lattice-postprocessor.cc index 46d44216890..49f96191787 100644 --- a/src/cudadecoder/lattice-postprocessor.cc +++ b/src/cudadecoder/lattice-postprocessor.cc @@ -78,13 +78,14 @@ bool LatticePostprocessor::GetPostprocessedLattice( KALDI_ASSERT(decoder_frame_shift_ != 0.0 && "SetDecoderFrameShift() must be called (typically by pipeline)"); - if (!word_info_) - KALDI_ERR << "You must set --word-boundary-rxfilename in the lattice " - "postprocessor config"; - // ok &= - // Ignoring the return false for now (but will print a warning), - // because the doc says we can, and it can happen when using endpointing - WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + if (word_info_) { + // ok &= + // Ignoring the return false for now (but will print a warning), + // because the doc says we can, and it can happen when using endpointing + WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + } else { + *out_clat = clat; + } return ok; } diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index d51948f1877..b67b0ba6fa6 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -689,11 +689,9 @@ typename Arc::StateId CreateSuperFinal(MutableFst *fst) { typedef typename Arc::Weight Weight; assert(fst != NULL); StateId num_states = fst->NumStates(); - StateId num_final = 0; std::vector final_states; for (StateId s = 0; s < num_states; s++) { if (fst->Final(s) != Weight::Zero()) { - num_final++; final_states.push_back(s); } } From aa17817f53ea44e44275bc494e747baaccc2e4d2 Mon Sep 17 00:00:00 2001 From: Tanmay Jain Date: Mon, 26 Dec 2022 20:37:51 +0530 Subject: [PATCH 18/76] Fix variable name (#4815) Fix the "glossaries_opt" variable name at line 39 of prepare_subword_text.sh. It was misspelled, so the reference expanded to an empty string and words in the glossaries were not reserved while creating the BPE.
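A minimal shell sketch of the failure mode (illustrative names and values, not taken from the recipe):

    glossaries="[unk]"                          # hypothetical glossary token
    glossaries_opt="--glossaries $glossaries"   # option is built correctly
    echo apply_bpe.py $glossaires_opt           # typo: this variable is unset
    # prints just "apply_bpe.py" -- the --glossaries option silently vanishes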
--- egs/wsj/s5/utils/subword/prepare_subword_text.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh index aa0163235a6..2a5750c9238 100755 --- a/egs/wsj/s5/utils/subword/prepare_subword_text.sh +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -36,7 +36,7 @@ grep -q $separator $word_text && echo "$0: Error, word text file contains separa glossaries_opt= [ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" cut -d ' ' -f2- $word_text | \ - utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub if [ $word_text == $subword_text ]; then mv $word_text ${word_text}.old cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text From 0785b66521d9732a0b2916e601830d751089f360 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Thu, 5 Jan 2023 10:11:12 -0800 Subject: [PATCH 19/76] Add support for CUDA 12 and Hopper. --- src/configure | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/configure b/src/configure index ed627eceedc..95338ea1bd0 100755 --- a/src/configure +++ b/src/configure @@ -283,6 +283,7 @@ Either your CUDA is too new or too old." GCC_VER=$($CXX -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in + # Update this list by consulting https://gist.github.com/ax3l/9489132 # Disabling CUDA 7 and CUDA 8 because we now use C++14 to compile CUDA # code. It is still possible to use those cuda versions by switching # back to C++11 in src/makefiles/cuda_64bit.mk and use CUB <= 1.8.0. @@ -317,7 +318,13 @@ Either your CUDA is too new or too old." 11_*) MIN_UNSUPPORTED_GCC_VER="12.0" MIN_UNSUPPORTED_GCC_VER_NUM=120000 - ;; + CUSOLVER=true + ;; + 12_*) + MIN_UNSUPPORTED_GCC_VER="12.2" + MIN_UNSUPPORTED_GCC_VER_NUM=122000 + CUSOLVER=true + ;; *) failure "Unsupported CUDA version ${CUDA_VERSION}. 
Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ @@ -345,6 +352,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; 11_0) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80" ;; 11_*) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" ;; + 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; *) failure \ "Unsupported CUDA version ${CUDA_VERSION}. Please open an" \ "issue at https://github.com/kaldi-asr/kaldi/issues and" \ From ae8cbe8858f2a66a9b193c82dbe3b0479364165f Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 13 Dec 2022 11:03:29 -0800 Subject: [PATCH 20/76] [misc] Install python2.7 This is to fix a CI error. It appears that this is from using "ubuntu-latest" in the CI workflow. It got upgraded to ubuntu 22.04 automatically, and this doesn't have python2.7 by default. --- .github/workflows/c-cpp.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index c1f923cf58a..8a21c82ea8f 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -19,6 +19,8 @@ jobs: - uses: actions/checkout@v3 - name: Install sox run: sudo apt-get install -y sox intel-mkl + - name: Install python2 + run: sudo apt-get install -y python2 - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: From 8c3c0bca5dfd4dcb45174b0d2744deb246552b2a Mon Sep 17 00:00:00 2001 From: Neimhin Date: Fri, 20 Jan 2023 14:34:10 +0000 Subject: [PATCH 21/76] Update install_srilm.sh --- tools/extras/install_srilm.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh index 813109dbb80..9f305c9310f 100755 --- a/tools/extras/install_srilm.sh +++ b/tools/extras/install_srilm.sh @@ -16,15 +16,15 @@ fi ! command -v gawk > /dev/null && \ echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; -if [ $# -ne 3 ]; then +if [ $# -ne 4 ]; then echo "SRILM download requires some information about you" echo - echo "Usage: $0 " + echo "Usage: $0
" exit 1 fi -srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" -post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" +srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download2.php" +post_data="file=1.7.3&name=$1&org=$2&email=$3&address=$4&license=on" if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then echo 'There was a problem downloading the file.' From e4eb4f6725d836b7915230f54adedfb605379254 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Fri, 3 Feb 2023 21:19:56 +0100 Subject: [PATCH 22/76] egs/ami: Fix BUT path to wavs in AMI scripts, add beamformer config (#4820) - the audio data no longer exist in that path - the beamformer config was missing in 'ami/s5b', it's taken from 'ami/s5' --- egs/ami/s5/run_ihm.sh | 2 +- egs/ami/s5/run_mdm.sh | 2 +- egs/ami/s5/run_sdm.sh | 2 +- egs/ami/s5b/cmd.sh | 2 +- egs/ami/s5b/conf/ami_beamformit.cfg | 50 +++++++++++++++++++++++++++++ egs/ami/s5b/run.sh | 2 +- egs/ami/s5c/run.sh | 8 ++--- 7 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 egs/ami/s5b/conf/ami_beamformit.cfg diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 0d40d25c23a..ed91a980791 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 4389c6b5d81..0cc76a56dd0 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -10,7 +10,7 @@ mic=mdm$nmics # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh index 17e2071f1f6..a212a8846b2 100755 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5b/cmd.sh b/egs/ami/s5b/cmd.sh index b004c5569df..a8ea5d7c1ba 100644 --- a/egs/ami/s5b/cmd.sh +++ b/egs/ami/s5b/cmd.sh @@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G" # the use of cuda_cmd is deprecated, used only in 'nnet1', export cuda_cmd="queue.pl --gpu 1 --mem 20G" -if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then +if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" diff --git a/egs/ami/s5b/conf/ami_beamformit.cfg b/egs/ami/s5b/conf/ami_beamformit.cfg new file mode 100644 
index 00000000000..70fdd858651 --- /dev/null +++ b/egs/ami/s5b/conf/ami_beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index 79989f17004..94cd81f230b 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -28,7 +28,7 @@ set -euo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5c/run.sh b/egs/ami/s5c/run.sh index cc4cd87610b..1281cad2e43 100755 --- a/egs/ami/s5c/run.sh +++ b/egs/ami/s5c/run.sh @@ -3,7 +3,7 @@ # Apache 2.0. # # This recipe performs diarization for the mix-headset data in the -# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 +# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 # corpus with simulated RIRs. We use oracle SAD in this recipe. # This recipe demonstrates the following: # 1. Diarization using x-vector and clustering (AHC, VBx, spectral) @@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx) # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac @@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then local/ami_download.sh $mic $AMI_DIR fi -# Prepare data directories. +# Prepare data directories. if [ $stage -le 2 ]; then # Download the data split and references from BUT's AMI setup if ! 
[ -d AMI-diarization-setup ]; then @@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\ ivector-normalize-length ark:- ark:- |" \ $model_dir/xvectors_plda_train/plda || exit 1; - + cp $model_dir/xvectors_plda_train/plda $model_dir/ cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ From ed910d6090e48417a90084d7161023f429fa4e1e Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Sat, 18 Feb 2023 14:15:56 +0300 Subject: [PATCH 23/76] Fix -Wdeprecated-copy from c++11 --- src/fstext/lattice-weight.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 7637c4d1c55..6e7737a195d 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -438,11 +438,9 @@ class CompactLatticeWeightTpl { CompactLatticeWeightTpl(const WeightType &w, const std::vector &s): weight_(w), string_(s) { } - CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) { - weight_ = w.weight_; - string_ = w.string_; - return *this; - } + CompactLatticeWeightTpl(const CompactLatticeWeightTpl &compactLatticeWeightTpl) = default; + + CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) = default; const W &Weight() const { return weight_; } From 59299d1cf95b72bb109d583947d9e9ece19aa6dc Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Mon, 20 Feb 2023 11:36:50 +0800 Subject: [PATCH 24/76] Fix for issue#4801 (#4826) --- cmake/gen_cmake_skeleton.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py index 5925c6369a8..c8fee4c415f 100644 --- a/cmake/gen_cmake_skeleton.py +++ b/cmake/gen_cmake_skeleton.py @@ -269,7 +269,7 @@ def gen_code(self): if len(self.depends) > 0: ret.append("target_link_libraries(" + self.target_name + " PUBLIC") - for d in self.depends: + for d in self.depends + ['-lcblas', '-llapack']: ret.append(" " + d) ret.append(")\n") From ab8fa9e46182c6550d115fb10c7032fedfd6e01a Mon Sep 17 00:00:00 2001 From: "Nickolay V. 
Shmyrev" Date: Mon, 17 Apr 2023 21:58:12 +0300 Subject: [PATCH 25/76] No need for atomicAdd for float2, conflicts with CUDA 12.1 (#4838) function is not used anyway --- src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..d803a915ea0 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -24,6 +24,7 @@ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { retval.y = a.y - b.y; return retval; } + __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x + b.x; @@ -31,11 +32,6 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { return retval; } -__device__ inline void atomicAdd(float2 *addr, float2 val) { - atomicAdd(reinterpret_cast(addr), val.x); - atomicAdd(reinterpret_cast(addr) + 1, val.y); -} - __device__ inline void operator+=(float2 &a, float2 &b) { // overloading += a.x += b.x; From 9a8588ac111e691a74bb5d98a6b11f699984f910 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Wed, 26 Apr 2023 11:26:01 +0300 Subject: [PATCH 26/76] More fixes of unwanted ADL usage of std algos (#4828) This continues the work started in #4809. --- src/fstext/determinize-star-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index e9650ca29a6..36c9ba397a6 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -725,7 +725,7 @@ void DeterminizerStar::EpsilonClosure:: { // this sorting is based on StateId - sort(ecinfo_.begin(), ecinfo_.end()); + std::sort(ecinfo_.begin(), ecinfo_.end()); output_subset->clear(); From 19185083f4ce3f74d7b2fc7494b8ea530feeab01 Mon Sep 17 00:00:00 2001 From: "Nickolay V. Shmyrev" Date: Wed, 26 Apr 2023 11:27:05 +0300 Subject: [PATCH 27/76] Fix matrix data offset for large matrices (#4823) * Fix matrix data offset for large matrices * Fix overflow in cudamatrix too --- src/cudamatrix/cu-matrix.h | 14 +++++++------- src/matrix/kaldi-matrix.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index a531ecd45b9..3ffe67d8b06 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -231,7 +231,7 @@ class CuMatrixBase { bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; /// Get size of matrix in bytes - MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } + size_t SizeInBytes() const { return static_cast(num_rows_)*static_cast(stride_)*sizeof(Real); } // Copy functions. These do not resize. 
From 9a8588ac111e691a74bb5d98a6b11f699984f910 Mon Sep 17 00:00:00 2001
From: Yuriy Chernyshov
Date: Wed, 26 Apr 2023 11:26:01 +0300
Subject: [PATCH 26/76] More fixes of unwanted ADL usage of std algos (#4828)

This continues the work started in #4809.
---
 src/fstext/determinize-star-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h
index e9650ca29a6..36c9ba397a6 100644
--- a/src/fstext/determinize-star-inl.h
+++ b/src/fstext/determinize-star-inl.h
@@ -725,7 +725,7 @@ void DeterminizerStar<F>::EpsilonClosure::
 
   {
     // this sorting is based on StateId
-    sort(ecinfo_.begin(), ecinfo_.end());
+    std::sort(ecinfo_.begin(), ecinfo_.end());
 
     output_subset->clear();
 
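The change above looks cosmetic but is not: an unqualified call to sort() is resolved with argument-dependent lookup, so the namespaces associated with the iterators' type are searched as well, and any function named sort in the namespace of the element type becomes a candidate, which can make the call ambiguous or pick the wrong overload. A self-contained illustration with a toy namespace (not Kaldi's actual types):

#include <algorithm>
#include <vector>

namespace mylib {
struct Info {
  int state;
  bool operator<(const Info &o) const { return state < o.state; }
};
// An unrelated helper that happens to share the name.
template <typename Iter>
void sort(Iter, Iter) {}
}  // namespace mylib

int main() {
  std::vector<mylib::Info> v{{3}, {1}, {2}};
  // sort(v.begin(), v.end());    // ADL also finds mylib::sort: ambiguous.
  std::sort(v.begin(), v.end());  // qualified: always the std algorithm
  return v.front().state == 1 ? 0 : 1;
}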
From 19185083f4ce3f74d7b2fc7494b8ea530feeab01 Mon Sep 17 00:00:00 2001
From: "Nickolay V. Shmyrev"
Date: Wed, 26 Apr 2023 11:27:05 +0300
Subject: [PATCH 27/76] Fix matrix data offset for large matrices (#4823)

* Fix matrix data offset for large matrices

* Fix overflow in cudamatrix too
---
 src/cudamatrix/cu-matrix.h | 14 +++++++-------
 src/matrix/kaldi-matrix.h  | 12 ++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index a531ecd45b9..3ffe67d8b06 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -231,7 +231,7 @@ class CuMatrixBase {
   bool ApproxEqual(const CuMatrixBase<Real> &other, float tol = 0.01) const;
 
   /// Get size of matrix in bytes
-  MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); }
+  size_t SizeInBytes() const { return static_cast<size_t>(num_rows_)*static_cast<size_t>(stride_)*sizeof(Real); }
 
   // Copy functions.  These do not resize.
   template<typename OtherReal>
@@ -670,13 +670,13 @@ class CuMatrixBase {
   inline const CuSubVector<Real> Row(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+    return CuSubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   inline CuSubVector<Real> Row(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return CuSubVector<Real>(data_ + (i * stride_), NumCols());
+    return CuSubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   inline CuValue<Real> operator() (MatrixIndexT r, MatrixIndexT c) {
@@ -684,7 +684,7 @@ class CuMatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return CuValue<Real>(data_ + r * stride_ + c);
+    return CuValue<Real>(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
 
   inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
@@ -692,7 +692,7 @@ class CuMatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return CuValue<Real>(data_ + r * stride_ + c);  // will be casted to Real.
+    return CuValue<Real>(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);  // will be casted to Real.
   }
 
   Real Sum() const;
@@ -737,10 +737,10 @@ class CuMatrixBase {
 
   /// Get raw row pointer (const).  Warning: may return a pointer to GPU memory.  Use at
   /// your own risk.
-  inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; }
+  inline const Real* RowData(MatrixIndexT r) const { return data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_); }
   /// Get raw row pointer.  Warning: may return a pointer to GPU memory.  Use at
   /// your own risk.
-  inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; }
+  inline Real* RowData(MatrixIndexT r) { return data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_); }
   /// Return data pointer (const).  Warning: may return a pointer to GPU memory.
   /// Use at your own risk.
   inline const Real *Data() const { return data_; }
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index bc95c9189f6..064edf4237b 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -87,14 +87,14 @@ class MatrixBase {
   inline Real* RowData(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
+    return data_ + static_cast<size_t>(i) * static_cast<size_t>(stride_);
   }
 
   /// Returns pointer to data for one row (const)
   inline const Real* RowData(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return data_ + i * stride_;
+    return data_ + static_cast<size_t>(i) * static_cast<size_t>(stride_);
   }
 
   /// Indexing operator, non-const
@@ -104,7 +104,7 @@ class MatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
+    return *(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
   /// Indexing operator, provided for ease of debugging (gdb doesn't work
   /// with parenthesis operator).
@@ -117,7 +117,7 @@ class MatrixBase {
                  static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                  static_cast<UnsignedMatrixIndexT>(c) <
                  static_cast<UnsignedMatrixIndexT>(num_cols_));
-    return *(data_ + r * stride_ + c);
+    return *(data_ + static_cast<size_t>(r) * static_cast<size_t>(stride_) + c);
   }
 
   /*   Basic setting-to-special values functions. */
@@ -188,14 +188,14 @@ class MatrixBase {
   inline const SubVector<Real> Row(MatrixIndexT i) const {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+    return SubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   /// Return specific row of matrix.
   inline SubVector<Real> Row(MatrixIndexT i) {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
-    return SubVector<Real>(data_ + (i * stride_), NumCols());
+    return SubVector<Real>(data_ + (static_cast<size_t>(i) * static_cast<size_t>(stride_)), NumCols());
   }
 
   /// Return a sub-part of matrix.
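The pattern in both files is the same: row * stride was computed in 32-bit MatrixIndexT arithmetic and only widened afterwards, so matrices beyond about 2^31 elements produced wrapped offsets. A compact standalone demonstration (not Kaldi code) of why the cast must happen before the multiply:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t num_rows = 70000, stride = 40000;  // 2.8e9 elements, above 2^31
  // Wrong: the product is formed in 32 bits first (undefined behavior for
  // signed int; in practice it wraps and typically goes negative).
  int64_t bad = num_rows * stride;
  // Right: widen each operand before multiplying, as the patch does.
  size_t good = static_cast<size_t>(num_rows) * static_cast<size_t>(stride);
  printf("bad = %lld, good = %zu\n", (long long)bad, good);
  return 0;
}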
From 40fa1487f89a076cca75178bd2ddd73edb07dff9 Mon Sep 17 00:00:00 2001
From: sendream <1149593720@qq.com>
Date: Thu, 27 Apr 2023 15:27:20 +0800
Subject: [PATCH 28/76] Add recipe of Tibetan Amdo dialect

---
 egs/xbmu_amdo31/README.txt                    |  11 +
 egs/xbmu_amdo31/s5/RESULTS                    |   8 +
 egs/xbmu_amdo31/s5/cmd.sh                     |  15 ++
 egs/xbmu_amdo31/s5/conf/decode.config         |   5 +
 egs/xbmu_amdo31/s5/conf/mfcc.conf             |   2 +
 egs/xbmu_amdo31/s5/conf/mfcc_hires.conf       |  10 +
 egs/xbmu_amdo31/s5/conf/online_cmvn.conf      |   1 +
 egs/xbmu_amdo31/s5/conf/online_pitch.conf     |   4 +
 egs/xbmu_amdo31/s5/conf/pitch.conf            |   1 +
 egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh    |   1 +
 .../s5/local/chain/tuning/run_tdnn_1a.sh      | 184 +++++++++++++++
 .../s5/local/chain/tuning/run_tdnn_2a.sh      | 211 ++++++++++++++++++
 .../s5/local/download_and_untar.sh            | 105 +++++++++
 .../s5/local/nnet3/run_ivector_common.sh      | 159 +++++++++++++
 egs/xbmu_amdo31/s5/local/nnet3/run_tdnn.sh    |   1 +
 .../s5/local/nnet3/tuning/run_tdnn_1a.sh      | 117 ++++++++++
 .../s5/local/nnet3/tuning/run_tdnn_2a.sh      | 145 ++++++++++++
 egs/xbmu_amdo31/s5/local/score.sh             |   8 +
 egs/xbmu_amdo31/s5/local/wer_hyp_filter       |  19 ++
 egs/xbmu_amdo31/s5/local/wer_output_filter    |  25 +++
 egs/xbmu_amdo31/s5/local/wer_ref_filter       |  19 ++
 .../s5/local/xbmu_amdo31_data_prep.sh         |  77 +++++++
 .../s5/local/xbmu_amdo31_prepare_dict.sh      |  36 +++
 .../s5/local/xbmu_amdo31_train_lms.sh         |  88 ++++++++
 egs/xbmu_amdo31/s5/path.sh                    |   6 +
 egs/xbmu_amdo31/s5/run.sh                     | 156 +++++++++++++
 egs/xbmu_amdo31/s5/steps                      |   1 +
 egs/xbmu_amdo31/s5/utils                      |   1 +
 28 files changed, 1416 insertions(+)
 create mode 100644 egs/xbmu_amdo31/README.txt
 create mode 100644 egs/xbmu_amdo31/s5/RESULTS
 create mode 100644 egs/xbmu_amdo31/s5/cmd.sh
 create mode 100644 egs/xbmu_amdo31/s5/conf/decode.config
 create mode 100644 egs/xbmu_amdo31/s5/conf/mfcc.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/online_cmvn.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/online_pitch.conf
 create mode 100644 egs/xbmu_amdo31/s5/conf/pitch.conf
 create mode 120000 egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/download_and_untar.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
 create mode 120000 egs/xbmu_amdo31/s5/local/nnet3/run_tdnn.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_1a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/score.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_hyp_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_output_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/wer_ref_filter
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
 create mode 100755 egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
 create mode 100755 egs/xbmu_amdo31/s5/path.sh
 create mode 100755 egs/xbmu_amdo31/s5/run.sh
 create mode 120000 egs/xbmu_amdo31/s5/steps
 create mode 120000 egs/xbmu_amdo31/s5/utils

diff --git a/egs/xbmu_amdo31/README.txt b/egs/xbmu_amdo31/README.txt
new file mode 100644
index 00000000000..d2cda16fa58
--- /dev/null
+++ b/egs/xbmu_amdo31/README.txt
@@ -0,0 +1,11 @@
+About the XBMU-AMDO31 corpus: XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.
+
+The XBMU-AMDO31 dataset is a speech recognition corpus of the Tibetan Amdo dialect. The open-source corpus contains 31 hours of speech data and resources for building speech recognition systems, including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused for the Amdo dialect because of the uniformity of written Tibetan.) The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
+
+The database can be downloaded from openslr:
+http://www.openslr.org/133/
+
+For more details, please visit:
+https://huggingface.co/datasets/syzym/xbmu_amdo31
+
+This recipe includes several ASR models trained with XBMU-AMDO31.
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/RESULTS b/egs/xbmu_amdo31/s5/RESULTS
new file mode 100644
index 00000000000..e50e43dc4db
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/RESULTS
@@ -0,0 +1,8 @@
+%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0
+%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0
+%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0
+%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0
+%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0
+%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0
+%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0
+%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0
diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh
new file mode 100644
index 00000000000..1ba3f789bf8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd=run.pl
+export decode_cmd=run.pl
+export mkgraph_cmd=run.pl
diff --git a/egs/xbmu_amdo31/s5/conf/decode.config b/egs/xbmu_amdo31/s5/conf/decode.config
new file mode 100644
index 00000000000..d91f86183af
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/decode.config
@@ -0,0 +1,5 @@
+beam=11.0 # beam for decoding.  Was 13.0 in the scripts.
+first_beam=8.0 # beam for 1st-pass decoding in SAT.
+
+
+
diff --git a/egs/xbmu_amdo31/s5/conf/mfcc.conf b/egs/xbmu_amdo31/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..a1aa3d6c158
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false # only non-default option.
+--sample-frequency=16000
diff --git a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..ca067e77b37
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--sample-frequency=16000 # XBMU-AMDO31 is sampled at 16kHz
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800)
diff --git a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..591367e7ae9
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
diff --git a/egs/xbmu_amdo31/s5/conf/online_pitch.conf b/egs/xbmu_amdo31/s5/conf/online_pitch.conf
new file mode 100644
index 00000000000..c0f1342160d
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/online_pitch.conf
@@ -0,0 +1,4 @@
+--sample-frequency=16000
+--simulate-first-pass-online=true
+--normalization-right-context=25
+--frames-per-chunk=10
diff --git a/egs/xbmu_amdo31/s5/conf/pitch.conf b/egs/xbmu_amdo31/s5/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..0c7ddcfe471
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+
+# This script is based on run_tdnn_7h.sh in swbd chain recipe.
+
+set -e
+
+# configs for 'chain'
+affix=
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn_1a  # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=1
+num_jobs_final=2
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=43 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_sp_lats \
+    --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 5 --cmd "$decode_cmd" \
+      --online-ivector-dir exp/nnet3/ivectors_$test_set \
+      $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+exit;
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
new file mode 100755
index 00000000000..669a014e8cf
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -0,0 +1,211 @@
+#!/usr/bin/env bash
+
+# This script is based on run_tdnn_1a.sh.
+# This setup uses online pitch to train the neural network.
+# It requires an online_pitch.conf in the conf dir.
+
+set -e
+
+# configs for 'chain'
+affix=
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn_2a  # Note: _sp will get added to this
+decode_iter=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=2
+num_jobs_final=12
+minibatch_size=128
+frames_per_eg=150,110,90
+remove_egs=true
+common_egs_dir=
+xent_regularize=0.1
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 9 ]; then
+  # Build a tree using our new topology.  This is the critically different
+  # step compared with other recipes.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=43 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 dim=625
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 11 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width $frames_per_eg \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
+    --trainer.optimization.final-effective-lrate $final_effective_lrate \
+    --trainer.max-param-change $max_param_change \
+    --cleanup.remove-egs $remove_egs \
+    --feat-dir data/${train_set}_hires_online \
+    --tree-dir $treedir \
+    --lat-dir exp/tri5a_sp_lats \
+    --dir $dir || exit 1;
+fi
+
+if [ $stage -le 12 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
+fi
+
+graph_dir=$dir/graph
+if [ $stage -le 13 ]; then
+  for test_set in dev test; do
+    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" \
+      --online-ivector-dir exp/nnet3/ivectors_$test_set \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+if [ $stage -le 14 ]; then
+  steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
+    --add-pitch true \
+    $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1;
+fi
+
+dir=${dir}_online
+if [ $stage -le 15 ]; then
+  for test_set in dev test; do
+    steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" \
+      --config conf/decode.config \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
+  done
+fi
+
+if [ $stage -le 16 ]; then
+  for test_set in dev test; do
+    steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+      --nj 10 --cmd "$decode_cmd" --per-utt true \
+      --config conf/decode.config \
+      $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1;
+  done
+fi
+
+exit;
diff --git a/egs/xbmu_amdo31/s5/local/download_and_untar.sh b/egs/xbmu_amdo31/s5/local/download_and_untar.sh
new file mode 100755
index 00000000000..9c70836bf46
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Johns Hopkins University (author: Daniel Povey)
+#           2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..70d492b2774
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts).  It contains the common feature preparation and
+# iVector-related parts of the script.  See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train
+test_sets="dev test"
+gmm=tri5a
+online=false
+nnet3_affix=
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_sp_ali
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+online_affix=
+if [ $online = true ]; then
+  online_affix=_online
+fi
+
+if [ $stage -le 1 ]; then
+  # Although the nnet will be trained by high resolution data, we still have to
+  # perturb the normal data to get the alignments.  _sp stands for speed-perturbed.
+  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
+  utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
+  echo "$0: making MFCC features for low-resolution speed-perturbed data"
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \
+    exp/make_mfcc/train_sp mfcc_perturbed || exit 1;
+  steps/compute_cmvn_stats.sh data/${train_set}_sp \
+    exp/make_mfcc/train_sp mfcc_perturbed || exit 1;
+  utils/fix_data_dir.sh data/${train_set}_sp
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: aligning with the perturbed low-resolution data"
+  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+    data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
+fi
+
+if [ $stage -le 3 ]; then
+  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
+  # this shows how you can split across multiple file-systems.
+  echo "$0: creating high-resolution MFCC features"
+  mfccdir=mfcc_perturbed_hires$online_affix
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/xbmu_amdo-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
+  fi
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix
+  done
+
+  # do volume-perturbation on the training data prior to extracting hires
+  # features; this helps make trained nnets more invariant to test data volume.
+  utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1;
+
+  for datadir in ${train_set}_sp ${test_sets}; do
+    steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1;
+    # create MFCC data dir without pitch to extract iVector
+    utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: computing a subset of data to train the diagonal UBM."
+  # We'll use about a quarter of the data.
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..a5b129be31c --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+  for decode_set in dev test; do
+    num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+    decode_dir=${dir}_online/decode_${decode_set}_per_utt
+    steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
+      --config conf/decode.config --per-utt true \
+      $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1;
+  done
+fi
+
+wait;
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/score.sh b/egs/xbmu_amdo31/s5/local/score.sh
new file mode 100755
index 00000000000..d283ceb68dc
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/score.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/xbmu_amdo31/s5/local/wer_hyp_filter b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/wer_output_filter b/egs/xbmu_amdo31/s5/local/wer_output_filter
new file mode 100755
index 00000000000..aceeeec41b4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (<>) {
+  @F = split " ";
+  print $F[0] . " ";
+  foreach $s (@F[1..$#F]) {
+    if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) {
+      print "";
+    } else {
+      print "$s"
+    }
+    print " ";
+  }
+  print "\n";
+}
+
+
diff --git a/egs/xbmu_amdo31/s5/local/wer_ref_filter b/egs/xbmu_amdo31/s5/local/wer_ref_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_ref_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
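The three filters above normalize text before scoring: wer_output_filter blanks out bracketed noise marks, angle-bracket tokens, and !SIL, while wer_hyp_filter and wer_ref_filter drop whatever tokens are listed in @filters. A quick sanity check of the output filter (the utterance id and tokens below are made up for illustration):

echo 'utt001 [NOISE] foo !SIL bar' | local/wer_output_filter
# prints "utt001  foo  bar " -- filtered tokens become empty fields,
# which the downstream scoring tools then ignore.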
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
new file mode 100755
index 00000000000..5cda85774a7
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+# Copyright 2017  Xingyu Na
+#           2021  Northwest Minzu University (Senyan Li)
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 <audio-path> <text-path>"
+  echo " $0 /export/data/xbmu_amdo31/data/wav /export/data/xbmu_amdo31/data/transcript"
+  exit 1;
+fi
+
+tibetan_audio_dir=$1
+tibetan_text=$2/transcript_clean.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $tibetan_audio_dir ] || [ ! -f $tibetan_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+echo $tibetan_audio_dir
+# find wav audio file for train, dev and test resp.
+find $tibetan_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 22630 ] && \
+  echo Warning: expected 22630 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+# Transcriptions preparation
+# cat $tibetan_text |head -10
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all
+  rm -f $dir/transcripts1.txt
+  cat $dir/utt.list |while read line
+  do
+    line1=`echo $line |cut -d "-" -f 2`
+    line2=`grep -w $line1 $tibetan_text |cut -d " " -f 2-`
+    text=$line" "$line2
+    echo $text >>$dir/transcripts1.txt
+  done
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: tibetan data preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
new file mode 100755
index 00000000000..1e5537858ff
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: Tibetan dict preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
new file mode 100755
index 00000000000..eaca5e2fafa
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments.  It assumes you have already run
+# xbmu_amdo31_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=`which train_lm.sh`
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of <SPOKEN_NOISE> as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<SPOKEN_NOISE>" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# From here is some commands to do a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+  # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "<SPOKEN_NOISE>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
+# Difference in WSJ must have been due to different treatment of <UNK>.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
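The first awk command in the script above is the OOV-mapping step: every transcript word that is missing from the lexicon is rewritten as <SPOKEN_NOISE> before any counting happens. The same idea in isolation, with a two-word toy lexicon (all file names and words below are made up for illustration):

cat > toy_lexicon.txt <<EOF
hello h eh l ow
world w er l d
EOF
# Field 1 of each transcript line is the utterance id, so start at n=2 here.
echo "utt1 hello zebra world" | \
  awk -v lex=toy_lexicon.txt \
    'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
     {for(n=2;n<=NF;n++){ if (seen[$n]) { printf("%s ", $n); }
                          else { printf("<SPOKEN_NOISE> "); } } printf("\n");}'
# prints: hello <SPOKEN_NOISE> world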
diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/xbmu_amdo31/s5/run.sh b/egs/xbmu_amdo31/s5/run.sh
new file mode 100755
index 00000000000..61b3e8f62d8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/run.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+# Copyright 2021  Northwest Minzu University (Author: Senyan Li)
+#           2017  Hui Bu
+#           2017  Jiayu Du
+#           2017  Xingyu Na
+#           2017  Bengu Wu
+#           2017  Hao Zheng
+# Apache 2.0
+
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
+# corpus directory and download URL
+data=/home1/lsy/kaldi/egs/xbmu_amdo31/s5/export/data
+data_url=www.openslr.org/resources/133
+
+. ./cmd.sh
+
+#local/download_and_untar.sh $data $data_url xbmu-amdo31 || exit 1;
+
+# Lexicon Preparation,
+local/xbmu_amdo31_prepare_dict.sh $data/xbmu_amdo31/resource || exit 1;
+
+# Data Preparation,
+local/xbmu_amdo31_data_prep.sh $data/xbmu_amdo31/data/wav $data/xbmu_amdo31/data/transcript || exit 1;
+
+# Phone Sets, questions, L compilation
+utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
+  "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1;
+
+# LM training
+local/xbmu_amdo31_train_lms.sh || exit 1;
+
+# G compilation, check LG composition
+utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
+  data/local/dict/lexicon.txt data/lang_test || exit 1;
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+for x in train dev test; do
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  utils/fix_data_dir.sh data/$x || exit 1;
+done
+
+# Train a monophone model on delta features.
+steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono || exit 1;
+
+# Decode with the monophone model.
+utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/dev exp/mono/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/test exp/mono/decode_test
+
+# Get alignments from monophone system.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+# Train the first triphone pass model tri1 on delta + delta-delta features.
+steps/train_deltas.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
+
+# decode tri1
+utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri1/graph data/dev exp/tri1/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri1/graph data/test exp/tri1/decode_test
+
+# align tri1
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
+
+# train tri2 [delta+delta-deltas]
+steps/train_deltas.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
+
+# decode tri2
+utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri2/graph data/dev exp/tri2/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/tri2/graph data/test exp/tri2/decode_test
+
+# Align training data with the tri2 model.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
+
+# Train the second triphone pass model tri3a on LDA+MLLT features.
+steps/train_lda_mllt.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
+
+# Run a test decode with the tri3a model.
+utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri3a/graph data/dev exp/tri3a/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri3a/graph data/test exp/tri3a/decode_test
+
+# align tri3a with fMLLR
+
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
+
+# Train the third triphone pass model tri4a on LDA+MLLT+SAT features.
+# From now on, we start building a more serious system with Speaker
+# Adaptive Training (SAT).
+steps/train_sat.sh --cmd "$train_cmd" \
+  2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
+
+# decode tri4a
+utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri4a/graph data/dev exp/tri4a/decode_dev
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri4a/graph data/test exp/tri4a/decode_test
+
+# align tri4a with fMLLR
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri4a exp/tri4a_ali
+
+# Train tri5a, which is LDA+MLLT+SAT
+# Building a larger SAT system. You can see the num-leaves is 3500 and tot-gauss is 100000
+
+steps/train_sat.sh --cmd "$train_cmd" \
+  3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
+
+# decode tri5a
+utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1;
+steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \
+  exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1;
+
+# align tri5a with fMLLR
+steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
+
+# nnet3
+local/nnet3/run_tdnn.sh
+
+# chain
+local/chain/run_tdnn.sh
+
+# getting results (see RESULTS file)
+for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+for x in exp/*/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null
+
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/steps b/egs/xbmu_amdo31/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/xbmu_amdo31/s5/utils b/egs/xbmu_amdo31/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file

From 3947a4be7815a968a1ffc0d3def8a7afd9949461 Mon Sep 17 00:00:00 2001
From: sendream <1149593720@qq.com>
Date: Thu, 27 Apr 2023 17:32:02 +0800
Subject: [PATCH 29/76] Modify code to conform to ShellCheck specifications

---
 egs/xbmu_amdo31/s5/cmd.sh                            |  6 +++---
 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_1a.sh |  2 +-
 egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh |  8 +++++---
 egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh    | 10 +++++-----
 egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh    |  2 +-
 egs/xbmu_amdo31/s5/path.sh                           |  2 +-
 9 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh
index 1ba3f789bf8..71dd849a93b 100644
--- a/egs/xbmu_amdo31/s5/cmd.sh
+++ b/egs/xbmu_amdo31/s5/cmd.sh
@@ -10,6 +10,6 @@
 # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
 # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
 
-export train_cmd=run.pl
-export decode_cmd=run.pl
-export mkgraph_cmd=run.pl
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+export mkgraph_cmd="queue.pl --mem 8G"
diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
index 0c7ddcfe471..826aa163f2a 100755
--- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -27,7 +27,7 @@ common_egs_dir=
 xent_regularize=0.1
 
 # End configuration section.
-echo "$0 $@"  # Print the command line for logging
+echo "$0 $*"  # Print the command line for logging
 
 . ./cmd.sh
 . ./path.sh
./path.sh diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh index 669a014e8cf..52d56adbc60 100755 --- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh @@ -29,7 +29,7 @@ common_egs_dir= xent_regularize=0.1 # End configuration section. -echo "$0 $@" # Print the command line for logging +echo "$0 $*" # Print the command line for logging . ./cmd.sh . ./path.sh diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh index 70d492b2774..610774fb2a2 100755 --- a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh +++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh @@ -90,7 +90,7 @@ if [ $stage -le 4 ]; then temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm num_utts_total=$(wc -l $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` +n=$(wc -l < "$tmp_dir/wav.flist") [ $n -ne 22630 ] && \ echo Warning: expected 141925 data data files, found $n @@ -49,13 +49,13 @@ for dir in $train_dir $dev_dir $test_dir; do sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all rm -f $dir/transcripts1.txt - cat $dir/utt.list |while read line + while read -r line do - line1=`echo $line |cut -d "-" -f 2` - line2=`grep -w $line1 $tibetan_text |cut -d " " -f 2-` + line1=$(echo "$line" | cut -d '-' -f 2) + line2=$(grep -w $line1 $tibetan_text |cut -d " " -f 2-) text=$line" "$line2 echo $text >>$dir/transcripts1.txt - done + done < "$dir/utt.list" paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt awk '{print $1}' $dir/transcripts.txt > $dir/utt.list diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh index eaca5e2fafa..658f0e7bc15 100755 --- a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh +++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh @@ -19,7 +19,7 @@ done dir=data/local/lm mkdir -p $dir -kaldi_lm=`which train_lm.sh` +kaldi_lm=$(command -v train_lm.sh) if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh index 2d17b17a84a..b70ffbfbb26 100755 --- a/egs/xbmu_amdo31/s5/path.sh +++ b/egs/xbmu_amdo31/s5/path.sh @@ -1,4 +1,4 @@ -export KALDI_ROOT=`pwd`/../../.. +export KALDI_ROOT=$(pwd)/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" 
&& exit 1 From cbc728497deded941940666292c418490ac87bce Mon Sep 17 00:00:00 2001 From: sendream <1149593720@qq.com> Date: Thu, 27 Apr 2023 17:46:53 +0800 Subject: [PATCH 30/76] Modify code to conform to ShellCheck specifications --- egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh index 6936f389bbb..3f920315b77 100755 --- a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -122,7 +122,7 @@ if [ $stage -le 11 ]; then # previous utterances of the same speaker. for decode_set in dev test; do # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(cat "data/${decode_set}_hires_online/utt2spk" | cut -d' ' -f2 | sort -u | wc -l) + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) decode_dir=${dir}_online/decode_$decode_set steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ --config conf/decode.config \ @@ -135,7 +135,7 @@ if [ $stage -le 12 ]; then # without carrying forward speaker information. for decode_set in dev test; do # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(cat "data/${decode_set}_hires_online/utt2spk" | cut -d' ' -f2 | sort -u | wc -l) + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) decode_dir=${dir}_online/decode_${decode_set}_per_utt steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ --config conf/decode.config --per-utt true \ From 0d7f17f3303bdcd5bfab4bdd5714bbd26dd2631a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 2 May 2023 23:44:15 +0800 Subject: [PATCH 31/76] Fix download location in install_liblbfgs.sh --- tools/extras/install_liblbfgs.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) mode change 100644 => 100755 tools/extras/install_liblbfgs.sh diff --git a/tools/extras/install_liblbfgs.sh b/tools/extras/install_liblbfgs.sh old mode 100644 new mode 100755 index 10f72cad84f..8a726dd144d --- a/tools/extras/install_liblbfgs.sh +++ b/tools/extras/install_liblbfgs.sh @@ -1,7 +1,9 @@ #!/bin/bash + VER=1.10 -if [ ! -f liblbfgs-$VER.tar.gz ]; then - wget https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz +if [ ! -f liblbfgs$VER.tar.gz ]; then + wget https://danielpovey.com/files/liblbfgs-1.10.tar.gz + ## wget https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz fi tar -xzf liblbfgs-$VER.tar.gz @@ -29,4 +31,3 @@ cd .. 
echo "export LIBLBFGS=$wd/liblbfgs-1.10" echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs ) >> env.sh - From 039ccbf26d241cf40d2e85a7bddadd97d06f5b5d Mon Sep 17 00:00:00 2001 From: Baffin Lee Date: Fri, 5 May 2023 11:47:26 +0000 Subject: [PATCH 32/76] [egs] convert tuple to NDArray before call sklearn.manifold.TSNE --- egs/gop_speechocean762/s5/local/visualize_feats.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/egs/gop_speechocean762/s5/local/visualize_feats.py b/egs/gop_speechocean762/s5/local/visualize_feats.py index 3b3ddaa037a..202c6a57b6b 100644 --- a/egs/gop_speechocean762/s5/local/visualize_feats.py +++ b/egs/gop_speechocean762/s5/local/visualize_feats.py @@ -8,6 +8,7 @@ import random import kaldi_io import seaborn as sns +import numpy as np from collections import Counter from sklearn.manifold import TSNE from utils import load_human_scores, load_phone_symbol_table @@ -62,6 +63,9 @@ def main(): min(args.samples, len(lables))) features, lables = list(zip(*sampled_paris)) + # Convert the tuple of arrays to a single 2D array + features = np.vstack(features) + # Draw scatters label_counter = Counter(lables) colors = sns.color_palette("colorblind", len(label_counter)) From f5805db451000c705bcefe8ef01658fb979f5cce Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Fri, 11 Aug 2023 12:41:12 -0500 Subject: [PATCH 33/76] Create Dockerfile Creates a Dockerfile in support of Ubuntu 22.04. --- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docker/ubuntu22.04-cuda12.2.0/Dockerfile diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile new file mode 100644 index 00000000000..1d247399c75 --- /dev/null +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -0,0 +1,48 @@ +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 +LABEL maintainer="williamhilton.works@gmail.com" + +RUN apt update && \ + apt install -y \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt update && \ + apt install -y \ + build-essential \ + g++ \ + make \ + automake \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + gfortran \ + patch \ + ffmpeg \ + vim \ + python2-dev \ + python3-dev && \ + apt update && \ + yes | DEBIAN_FRONTEND=noninteractive apt install -yqq \ + intel-mkl && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi/tools && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) && \ + find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + rm -rf /opt/kaldi/.git + +WORKDIR /opt/kaldi/ From 745c6e1f5c0b38c8bcbdfdd69c01b83ef7206e3d Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Fri, 11 Aug 2023 13:46:43 -0500 Subject: [PATCH 34/76] Update Dockerfile Reduces image size and re-organizes install list for clarity. 
--- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile index 1d247399c75..6e6ea2e7ce0 100644 --- a/docker/ubuntu22.04-cuda12.2.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -2,11 +2,7 @@ FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 LABEL maintainer="williamhilton.works@gmail.com" RUN apt update && \ - apt install -y \ - software-properties-common && \ - apt-add-repository multiverse && \ - apt update && \ - apt install -y \ + apt install -y --no-install-recommends \ build-essential \ g++ \ make \ @@ -25,11 +21,13 @@ RUN apt update && \ gfortran \ patch \ ffmpeg \ - vim \ - python2-dev \ - python3-dev && \ + vim && \ + apt update && \ + apt install -y --no-install-recommends\ + software-properties-common && \ + apt-add-repository multiverse && \ apt update && \ - yes | DEBIAN_FRONTEND=noninteractive apt install -yqq \ + yes | DEBIAN_FRONTEND=noninteractive apt install -yqq --no-install-recommends\ intel-mkl && \ rm -rf /var/lib/apt/lists/* From ebf624594ea46f0872f5de201b3c217999f3d8fc Mon Sep 17 00:00:00 2001 From: Stu Hilton Date: Sat, 12 Aug 2023 09:31:36 -0500 Subject: [PATCH 35/76] Update Dockerfile Using apt-get instead of apt --- docker/ubuntu22.04-cuda12.2.0/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/ubuntu22.04-cuda12.2.0/Dockerfile b/docker/ubuntu22.04-cuda12.2.0/Dockerfile index 6e6ea2e7ce0..ae413def077 100644 --- a/docker/ubuntu22.04-cuda12.2.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12.2.0/Dockerfile @@ -1,8 +1,8 @@ FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 LABEL maintainer="williamhilton.works@gmail.com" -RUN apt update && \ - apt install -y --no-install-recommends \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ build-essential \ g++ \ make \ @@ -22,12 +22,12 @@ RUN apt update && \ patch \ ffmpeg \ vim && \ - apt update && \ - apt install -y --no-install-recommends\ + apt-get update && \ + apt-get install -y --no-install-recommends\ software-properties-common && \ apt-add-repository multiverse && \ - apt update && \ - yes | DEBIAN_FRONTEND=noninteractive apt install -yqq --no-install-recommends\ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ intel-mkl && \ rm -rf /var/lib/apt/lists/* From aef1d98603b68e6cf3a973e9dcd71915e2a175fe Mon Sep 17 00:00:00 2001 From: Egor Tyuvaev Date: Fri, 13 Oct 2023 11:19:51 +0200 Subject: [PATCH 36/76] Update install_mkl.sh Update Intel APT key to mitigate installation error --- tools/extras/install_mkl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh index 8c1899bdf2f..ddcd372a02c 100755 --- a/tools/extras/install_mkl.sh +++ b/tools/extras/install_mkl.sh @@ -16,7 +16,7 @@ default_package=intel-mkl-64bit-2020.0-088 yum_repo='https://yum.repos.intel.com/mkl/setup/intel-mkl.repo' apt_repo='https://apt.repos.intel.com/mkl' -intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB' +intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB' Usage () { cat >&2 < Date: Fri, 20 Oct 2023 11:48:09 +0000 Subject: [PATCH 37/76] Fix __CUDA_ARCH__ issue and add more hipification. 
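
The hipification approach below leaves the CUDA sources in place and remaps them
at compile time. As a rough sketch of what the build does for a single kernel
file, one can compile it by hand from src/chain (the gfx90a offload target and
this trimmed-down flag set are illustrative assumptions, not part of the patch):

    # Compile a CUDA kernel source as HIP; hipify.h then maps the CUDA API to ROCm.
    hipcc -c -x hip chain-kernels.cu -o /dev/null \
        --offload-arch=gfx90a -std=c++14 \
        -D__IS_HIP_COMPILE__=1 -DHAVE_CUDA=1 -DKALDI_DOUBLEPRECISION=0 \
        -I.. -I../hip

Passing -x hip is what makes hipcc treat the unmodified .cu file as HIP code, so
the macro mappings this patch extends in src/hip/hipify.h can do the API
translation.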
--- src/chain/chain-kernels.cu | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeatbin/Makefile | 8 +++++--- src/cudafeatbin/apply-batched-cmvn-online-cuda.cc | 2 ++ .../compute-fbank-online-batched-cuda.cc | 2 ++ .../compute-mfcc-online-batched-cuda.cc | 2 ++ .../compute-online-feats-batched-cuda.cc | 2 ++ src/cudafeatbin/compute-online-feats-cuda.cc | 2 ++ src/cudamatrix/cu-kernels.cu | 1 + src/hip/hipify.h | 5 +++++ src/makefiles/hip_64bit.mk | 15 +++++++++++---- 11 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 2a30128750c..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -21,6 +21,7 @@ #include "chain/chain-kernels-ansi.h" #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #endif diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 8d4648d04bb..1c896f1307f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -16,6 +16,7 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #else diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..2cd6bbb6a93 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,9 +16,11 @@ // limitations under the License. 
#if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..70380f8ccad 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudafeat/online-cuda-feature-pipeline.h" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1d6e0664541..1b0cf1f2c90 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -30,6 +30,7 @@ #include #include #ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 #include #include "hipify.h" #include "cudamatrix/cu-kernels-ansi.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index b631ac08a23..723b5b1f059 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -148,6 +148,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaHostRegister hipHostRegister #define cudaHostRegisterDefault hipHostRegisterDefault #define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost hipHostMalloc #define cudaMallocPitch hipMallocPitch @@ -157,12 +158,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {} #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo #define cudaMemset2DAsync hipMemset2DAsync #define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop #define cudaSetDevice hipSetDevice #define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamDestroy hipStreamDestroy #define cudaStreamLegacy ((hipStream_t)1) +#define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread ((hipStream_t)2) #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk index e2f43ecd55c..8d85872aa9b 100644 --- a/src/makefiles/hip_64bit.mk +++ b/src/makefiles/hip_64bit.mk @@ -29,12 +29,19 @@ ROCM_INCLUDE = -I$(ROCMDIR)/hipsparse/include \ -I$(ROCMDIR)/rocrand/include \ -I$(ROCMDIR)/include \ -I.. -I../hip -isystem $(OPENFSTINC) + +# TODO: Consider passing __CUDA_ARCH__=800 here as it is mostly supported by ROCm. +# However this macro has some side effect with HIPCC that makes it assume +# CUDA is active and everything is device compiles. ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \ -D__IS_HIP_COMPILE__=1 \ -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \ - -D__CUDACC_VER_MAJOR__=11 -D__CUDA_ARCH__=800 -DCUDA_VERSION=11000 \ - -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 + -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics -#TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. +# TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles. 
+# We allow the libraries we link against to have undefined symbols so that this can be built on
+# systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
-CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64
+CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
+LDLIBS += -Wl,--allow-shlib-undefined

From 3a8896c2a3bd13835e45b11eed6f2ce0044d5260 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 1 Nov 2023 20:13:34 +0800
Subject: [PATCH 38/76] Fix LatticeSimpleDecoder

---
 src/decoder/lattice-simple-decoder.cc | 2 +-
 src/gmm/mle-diag-gmm.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc
index cc8712e854d..d6b0727ef07 100644
--- a/src/decoder/lattice-simple-decoder.cc
+++ b/src/decoder/lattice-simple-decoder.cc
@@ -571,7 +571,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() {
     }
     if (queue.empty()) {
       if (!warned_) {
-        KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is "
+        KALDI_LOG << "Error in ProcessNonEmitting: no surviving tokens: frame is "
                   << frame;
         warned_ = true;
       }
diff --git a/src/gmm/mle-diag-gmm.h b/src/gmm/mle-diag-gmm.h
index d41d36489bf..3763943a89b 100644
--- a/src/gmm/mle-diag-gmm.h
+++ b/src/gmm/mle-diag-gmm.h
@@ -93,7 +93,7 @@ struct MapDiagGmmOptions {
   void Register(OptionsItf *opts) {
     opts->Register("mean-tau", &mean_tau,
                    "Tau value for updating means.");
-    opts->Register("variance-tau", &mean_tau,
+    opts->Register("variance-tau", &variance_tau,
                    "Tau value for updating variances (note: only relevant if "
                    "update-flags contains \"v\".");
     opts->Register("weight-tau", &weight_tau,

From cdbc05b0d3611618297a7d21017de5f10126fc7a Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Nov 2023 11:02:01 +0000
Subject: [PATCH 39/76] Fixes to work with CUDA 12 toolkit

---
 src/cudamatrix/cu-kernels.cu | 73 ++++++++++++++-----------
 src/cudamatrix/cu-sparse-matrix-test.cc | 6 +-
 src/cudamatrix/cu-sparse-matrix.cc | 65 +++++++++++++++-------
 src/cudamatrix/cu-sparse-matrix.h | 21 +++----
 4 files changed, 99 insertions(+), 66 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8044ff699bc..7ffdc541113 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
- if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } - } // output TileDim sums per thread block if (tid < TileDim) { @@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } - // output TileDim sums per thread block if (tid < TileDim && j_n < dim_N.cols) { v[j_n] = alpha * smem.sum[tid] + beta * v[j_n]; @@ -1793,10 +1796,11 @@ static void _vec_transform_reduce( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); } + __syncwarp(); } // Output to vector result. @@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } // Output to vector result. @@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } - + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2087,13 +2095,12 @@ static void _group_transform_reduce( x_idx += threads_per_group; } sreduction[tid] = treduction; - if (threads_per_group > warpSize) { - __syncthreads(); - } + __syncthreads(); // tree-reduce to 2x warpSize elements per group # pragma unroll - for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) { + int shift = threads_per_group / 2; + for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } @@ -2101,14 +2108,12 @@ static void _group_transform_reduce( } // Warp-reduce to 1 element per group. - // Threads implicitly synchronized within the warp. - const int warp_reduce_size = - threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize; - if (threadIdx.x < warp_reduce_size) { # pragma unroll - for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { + for (; shift > 0; shift >>= 1) { + if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } + __syncwarp(); } // Store the result. 
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, } // reduce to 1 element per row - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; } + __syncwarp(); } // broadcast the sum results @@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, } // Warp reduce without __syncthreads() // (note.: synchronizes implicitly within a warp at the multiprocessor) - if (tid < warpSize / 2) { #pragma unroll - for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; - num_working_threads >>= 1) { + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (tid < warpSize / 2) { if (smax[tid + num_working_threads] > smax[tid]) { smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } } + __syncwarp(); } if (tid == 0) { diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index aad34b5dd54..0c2230a8731 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() { template static void UnitTestCuSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 2; i++) { - MatrixIndexT row = 10 + Rand() % 40; - MatrixIndexT col = 10 + Rand() % 50; + MatrixIndexT row = 2 + Rand() % 3; + MatrixIndexT col = 1 + Rand() % 4; CuMatrix mat1(row, col); CuMatrix mat2(col, row); @@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() { cu_smat2.CopyToMat(&mat2); Real trace1 = TraceMatMat(mat3, mat1, kTrans); + Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans); AssertEqual(trace1, trace2, 0.00001); trace1 = TraceMatMat(mat3, mat2, kNoTrans); trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans); + AssertEqual(trace1, trace2, 0.00001); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..f24613fa231 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -161,7 +161,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -194,8 +194,8 @@ template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( - NULL) { + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), + csr_val_(NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); @@ -266,8 +266,9 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); + csr_col_idx_ = NULL; // may be freed, but this is allowed. 
csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); @@ -277,10 +278,16 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( - (num_rows + 1 + nnz) * sizeof(int))); - csr_val_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int))); + if (nnz > 0) { + csr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + nnz * sizeof(int))); + csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); + } else { + csr_col_idx_ = NULL; + csr_val_ = NULL; + } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { @@ -302,8 +309,11 @@ void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - if (csr_row_ptr_col_idx_) { - CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); + if (csr_row_ptr_) { + CuDevice::Instantiate().Free(csr_row_ptr_); + } + if (csr_col_idx_) { + CuDevice::Instantiate().Free(csr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); @@ -311,7 +321,8 @@ void CuSparseMatrix::Destroy() { num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = NULL; + csr_row_ptr_ = NULL; + csr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -378,11 +389,17 @@ void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); - CuSubArray idx_to(csr_row_ptr_col_idx_, - NumRows() + 1 + NumElements()); - CuSubArray idx_from(smat.csr_row_ptr_col_idx_, - smat.NumRows() + 1 + smat.NumElements()); - idx_to.CopyFromArray(idx_from); + { + CuSubArray idx_to(csr_row_ptr_, NumRows() + 1); + CuSubArray idx_from(smat.csr_row_ptr_, NumRows() + 1); + idx_to.CopyFromArray(idx_from); + } + + { + CuSubArray idx_to(csr_col_idx_, NumElements()); + CuSubArray idx_from(smat.csr_col_idx_, NumElements()); + idx_to.CopyFromArray(idx_from); + } } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined); @@ -413,9 +430,14 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { smat->Resize(0, 0); return; } - CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); - std::vector idx_cpu; - idx.CopyToVec(&idx_cpu); + CuSubArray row_ptr(csr_row_ptr_, NumRows() + 1); + std::vector row_ptr_cpu; + row_ptr.CopyToVec(&row_ptr_cpu); + + + CuSubArray col_idx(csr_col_idx_, NumElements()); + std::vector col_idx_cpu; + col_idx.CopyToVec(&col_idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); @@ -425,8 +447,8 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { - for (; n < idx_cpu[i + 1]; ++n) { - const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; + for (; n < row_ptr_cpu[i + 1]; ++n) { + const MatrixIndexT j = col_idx_cpu[n]; pairs[i].push_back( { j, val_cpu(n) }); } } @@ -484,7 +506,8 @@ void CuSparseMatrix::Swap(CuSparseMatrix *smat) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); - std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); + std::swap(csr_row_ptr_, smat->csr_row_ptr_); + std::swap(csr_col_idx_, smat->csr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif diff --git a/src/cudamatrix/cu-sparse-matrix.h 
b/src/cudamatrix/cu-sparse-matrix.h index 82b17a0dc71..180beed6183 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,13 +121,13 @@ class CuSparseMatrix { /// Default constructor CuSparseMatrix() : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { } /// Constructor from CPU-based sparse matrix. explicit CuSparseMatrix(const SparseMatrix &smat) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat); } @@ -135,7 +135,7 @@ class CuSparseMatrix { /// Constructor from GPU-based sparse matrix (supports transposition). CuSparseMatrix(const CuSparseMatrix &smat, MatrixTransposeType trans = kNoTrans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat, trans); } @@ -200,19 +200,19 @@ class CuSparseMatrix { /// indices of the first nonzero element in the i-th row, while the last entry /// contains nnz_, as zero-based CSR format is used. const int* CsrRowPtr() const { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } int* CsrRowPtr() { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } /// Returns pointer to the integer array of length nnz_ that contains /// the column indices of the corresponding elements in array CsrVal() const int* CsrColIdx() const { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } int* CsrColIdx() { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } private: @@ -238,9 +238,10 @@ class CuSparseMatrix { // number of non-zeros MatrixIndexT nnz_; - // csr row ptrs and col indices in a single int array - // of the length (num_rows_ + 1 + nnz_) - int* csr_row_ptr_col_idx_; + // length num_rows_ + 1 + int* csr_row_ptr_; + // length nnz_ + int* csr_col_idx_; // csr value array of the length nnz_ Real* csr_val_; From fe127209cc0d653cef80a5e81487a95f0405de32 Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Sat, 4 Nov 2023 18:01:27 +0100 Subject: [PATCH 40/76] Openblas repo was renamed --- tools/extras/install_openblas.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/extras/install_openblas.sh b/tools/extras/install_openblas.sh index ce0fdf7fbdb..521d096adbd 100755 --- a/tools/extras/install_openblas.sh +++ b/tools/extras/install_openblas.sh @@ -19,18 +19,18 @@ fi tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz -rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz +rm -rf OpenMathLib-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz if [ -d "$DOWNLOAD_DIR" ]; then cp -p "$DOWNLOAD_DIR/$tarball" . else - url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') + url=$($WGET -qO- "https://api.github.com/repos/OpenMathLib/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python3 -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') test -n "$url" $WGET -t3 -nv -O $tarball "$url" fi tar xzf $tarball -mv xianyi-OpenBLAS-* OpenBLAS +mv OpenMathLib-OpenBLAS-* OpenBLAS make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install if [ $? 
-eq 0 ]; then From f584420d8c1448e8e70f9106aa49712f63d06347 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Mon, 6 Nov 2023 16:57:15 +0000 Subject: [PATCH 41/76] Fix tests with zero size matrices and needing syncwarp for LDS sharing. --- ...ure-online-batched-ivector-cuda-kernels.cu | 30 +++-- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 4 +- .../online-ivector-feature-cuda-kernels.cu | 26 ++-- src/cudamatrix/cu-device.cc | 4 + src/cudamatrix/cu-kernels.cu | 127 ++++++++++++++---- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-math.cc | 2 +- src/cudamatrix/cu-matrix-test.cc | 24 +++- src/cudamatrix/cu-matrix.cc | 12 +- src/cudamatrix/cu-sparse-matrix.cc | 6 +- src/cudamatrix/cu-vector.cc | 13 +- src/hip/hipify.h | 35 ++++- src/makefiles/hip_64bit.mk | 7 +- 15 files changed, 219 insertions(+), 90 deletions(-) diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b4cfce812c..e5b89d163e5 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -50,7 +50,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -101,8 +101,10 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -215,8 +217,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads dim3 blocks(num_chunk_frames, num_lanes); @@ -311,10 +313,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() + KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, 
feat_dim, ldf, @@ -324,8 +326,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, { int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -507,8 +509,8 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -584,7 +586,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -629,7 +631,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index f847311d755..27375f4914e 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -68,7 +68,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -487,7 +487,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index 1c896f1307f..f8947a3b5ed 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -188,8 +188,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) 
threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index c320c85a029..9c0d5df5288 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -134,7 +134,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -493,7 +493,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 378ea18e689..dffc9fd3c8f 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -26,17 +26,17 @@ #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +47,13 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,9 +139,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -207,7 +207,7 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, 
float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +215,8 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -238,7 +238,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -249,7 +249,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 3dada172ba8..25775fb1b05 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -249,8 +249,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode + #ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; + #else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; + #endif // We want to choose the device more carefully, so release the CUDA context. 
e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 1b0cf1f2c90..792932c18d5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -966,6 +966,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1118,8 +1119,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1176,6 +1177,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1219,6 +1221,7 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1270,6 +1273,7 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1353,6 +1357,7 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; + __syncwarp(); } } @@ -1805,6 +1810,7 @@ static void _vec_transform_reduce( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } } @@ -1904,7 +1910,6 @@ __global__ void _strided_reduction_fused_kernel(Real * __restrict__ dots, const int idx = colStart + (j + u*stride) * d.stride; vals[u] = op.Transform(data[idx]); } - #pragma unroll for (int u = 0; u < unroll_count; ++u) { thread_data = op.Reduce(thread_data, vals[u]); @@ -2018,6 +2023,7 @@ static void _transform_reduce_mat_rows( if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + __syncwarp(); } // Output to vector result. @@ -2042,9 +2048,27 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } + + // if (tid == 0) { + // for (int j = 0; j < d.cols; j += 1) + // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); + // result[i] = tdata; + + // } + // return; + sdata[tid] = tdata; __syncthreads(); + // if (tid == 0) { + // tdata = 0; + // for (int j = 0; j < CU1DBLOCK; j += 1) + // tdata = op.Reduce(tdata, op.Transform(sdata[j])); + // result[i] = tdata; + // } + + // return; + // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2053,12 +2077,30 @@ static void _transform_reduce_mat_cols( __syncthreads(); } + // if (tid == 0) { + // tdata = 0; + // for (int j = 0; j < 2*warpSize; j += 1) + // tdata = op.Reduce(tdata, op.Transform(sdata[j])); + // result[i] = tdata; + // } + + // return; + + // Reduce last warp. Threads implicitly synchronized within a warp. 
if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) - sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + for (int shift = warpSize; shift > 0; shift >>= 1) { + sdata[tid] += sdata[tid + shift]; + __syncwarp(); + //__syncthreads(); // Why this needed? + } } + if (tid == 0) + result[i] = sdata[0]; + + return; + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2117,6 +2159,7 @@ static void _group_transform_reduce( # pragma unroll for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); + __syncwarp(); } } @@ -2981,6 +3024,7 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, for (int shift = warpSize; shift > 0; shift >>= 1) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; + __syncwarp(); } } @@ -3271,6 +3315,7 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } + __syncwarp(0xffffffffu >> (32-num_working_threads)); } } @@ -3999,7 +4044,7 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) // grid dim x,y expands to fill out average in x/y across batches // grid dim.z is batch template @@ -4380,7 +4425,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4401,6 +4446,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -4409,9 +4459,13 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, const float* N, const MatrixDim dim_N, const float beta, float* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -4451,6 +4505,7 @@ void cudaF_vector_copy_elements(dim3 Gr, dim3 Bl, float *data, int dim, transpose, elements); } + void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -5086,7 +5141,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat <<>>(A,B,dA,B_stride,value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5107,6 +5162,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, 
M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); +#endif } } @@ -5115,9 +5175,13 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, const double* N, const MatrixDim dim_N, const double beta, double* v) { if (Bl.x == 16) { - _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<16><<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x==64) { + _add_diag_mat_mat_MN<64><<>>(alpha,M,stride_M,N,dim_N,beta,v); +#endif } } @@ -5488,25 +5552,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all threads without blocking. void cuda_legacy_noop() { +#ifdef __IS_HIP_COMPILE__ + // HIP doesn't currently support cudaStreamLegacy stream so we force to use the + // non-per-thread API to get similar semantics. 
@@ -5802,7 +5866,14 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest,
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
 // this will synchronize all threads without blocking.
 void cuda_legacy_noop() {
+#ifdef __IS_HIP_COMPILE__
+  // HIP does not currently support the legacy default stream (cudaStreamLegacy),
+  // so we use the non-per-thread launch API to get similar semantics.
+  auto k = reinterpret_cast<void *>(_noop_kernel);
+  hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0);
+#else
   _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>();
+#endif
 }
 
 void cudaF_mat_copy_range_clamped(
@@ -5812,8 +5883,8 @@ void cudaF_mat_copy_range_clamped(
   float *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end, num_cols,
     src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5826,8 +5897,8 @@ void cudaD_mat_copy_range_clamped(
   double *dst, int32_t ldd) {
 
   int32_t num_rows = row_end - row_start;
-  dim3 threads(32,32);
-  dim3 blocks((num_cols+31)/32,(num_rows+31)/32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
+  dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK);
 
   _cuda_mat_copy_range_clamped<<<blocks,threads>>>(row_start, row_end, num_cols,
     src, lds, clamp_low, clamp_high, dst, ldd);
@@ -5837,7 +5908,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const float **inputs,
                              int32_t *ldi, float **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc batch_desc;
@@ -5863,8 +5934,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
       int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.  Memory will be passed down directly
@@ -5886,8 +5957,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here.  Memory will be passed down directly
@@ -5902,7 +5973,7 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
                              int32_t *num_cols, const double **inputs,
                              int32_t *ldi, double **outputs, int32_t *ldo) {
-  dim3 threads(32,32);
+  dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK);
   int32_t total_rows=0, total_cols=0;
 
   BatchedMatrixCopyDesc batch_desc;
@@ -5928,8 +5999,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
       // compute average number of rows/cols across batch
       int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE);
      int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE);
-      dim3 blocks((cols + 31) / 32,
-                  (rows + 31) / 32,
+      dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                  (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                   MAX_BATCH_SIZE);
 
       // no memcpy needed here.  Memory will be passed down directly
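All the (n + tile - 1) / tile expressions these launchers introduce are ceiling division, sizing the grid so warp-shaped tiles cover the whole matrix. As a standalone sketch (the helper name is assumed, as is the CUDA side defining GPU_WARP_SIZE = 32 and GPU_MAX_WARPS_PER_BLOCK = 32, which is what recovering the old hard-coded 32x32 blocks implies):

    // Number of 'tile'-sized blocks needed to cover 'n' elements.
    static inline unsigned int ceil_div(unsigned int n, unsigned int tile) {
      return (n + tile - 1) / tile;
    }

    // Mirrors the launches above. On HIP, GPU_WARP_SIZE is 64 and
    // GPU_MAX_WARPS_PER_BLOCK is 1024 / 64 = 16, so blocks keep 1024
    // threads while the tile shape follows the wavefront width.
    //   dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
    //   dim3 blocks(ceil_div(num_cols, GPU_WARP_SIZE),
    //               ceil_div(num_rows, GPU_MAX_WARPS_PER_BLOCK));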
@@ -5951,8 +6022,8 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
     int32_t rows = ceilf(total_rows / (float)remaining);
     int32_t cols = ceilf(total_cols / (float)remaining);
-    dim3 blocks((cols + 31) / 32,
-                (rows + 31) / 32,
+    dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE,
+                (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK,
                 remaining);
 
     // no memcpy needed here.  Memory will be passed down directly
diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc
index e1d59e777be..1245fb28bad 100644
--- a/src/cudamatrix/cu-math-test.cc
+++ b/src/cudamatrix/cu-math-test.cc
@@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() {
   for (int32 loop = 0; loop < 10; loop++) {
 
     // problem dimensions.
-    int32 num_rows = RandInt(5, 20),
-        cell_dim = RandInt(2, 200),
-        dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3);
+    int32 num_rows = RandInt(5, 20), //16
+        cell_dim = RandInt(2, 200), //45
+        dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); //3
 
     // Pick the (input or params block), and output block, for which we'll
     // spot-check the derivative values.  This will give us test failures
@@ -232,7 +232,6 @@ void UnitTestLstmNonlinearity() {
     else
       test_params = -1;
 
-
     CuMatrix<Real> input(num_rows, cell_dim * 5 + dropout_dim),
         params(3, cell_dim),
         output_deriv(num_rows, cell_dim * 2);
@@ -277,11 +276,11 @@ void UnitTestLstmNonlinearity() {
     for (int32 i = 0; i < test_dim; i++) {
       CuMatrix<Real> delta_input(num_rows, 5 * cell_dim + dropout_dim),
           delta_params(3, cell_dim);
-      if (test_input >= 0) {
+      if (test_input >= 0) { // -1
         delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn();
         delta_input.Scale(delta);
       }
-      if (test_params >= 0) {
+      if (test_params >= 0) { // 0
         delta_params.Row(test_params).SetRandn();
         delta_params.Scale(delta);
       }
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index 3fbeff3a470..d0d8e4e771f 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase<Real> &input,
 
     // Use 2D block (8x32 threads) as we need to compute column sum.
     // Use 1D grid to cover the data matrix width `cell_dim`.
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     //    dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
     //                 n_blocks(num_rows, dimBlock.y));
diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index be8483e48f5..26a5281ec05 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -2675,10 +2675,18 @@ static void UnitTestCuMatrixSetRandn() {
 template<typename Real>
 static void UnitTestCuMatrixSetRandUniform() {
+
+  //  if (CuDevice::Instantiate().Enabled()) {
+  //    CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456));
+  //  }
+
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200;
+    MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200;
     CuMatrix<Real> M(rows, cols);
     M.SetRandUniform();
+    // M.SetZero();
+    // M.Add(0.5);
+    // M.SetZeroAboveDiag();
 
     M.Add(-0.5);  // we'll be testing the central moments, so
                   // center it around zero first.
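The bounds this test checks come from the central moments of the uniform distribution: after Add(-0.5) the entries are U(-1/2, 1/2), whose odd central moments vanish and whose even ones are (1/2)^p / (p + 1), i.e. 1/12 for the variance and 1/80 for the fourth moment. A standalone check of those reference values (plain C++, not part of the test):

    #include <cmath>
    #include <cstdio>

    int main() {
      // E[X^p] for X ~ U(-1/2, 1/2): 0 for odd p, (1/2)^p / (p + 1) for even p.
      for (int p = 1; p <= 4; ++p) {
        double expected = (p % 2) ? 0.0 : std::pow(0.5, p) / (p + 1);
        std::printf("central moment %d: %.6f\n", p, expected);
      }
      return 0;  // prints 0, 0.083333, 0, 0.012500
    }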
@@ -2693,6 +2701,16 @@ static void UnitTestCuMatrixSetRandUniform() {
   for (int32 pow = 1; pow < central_moments.Dim(); pow++) {
     CuMatrix<Real> Mpow(M);
     Mpow.ApplyPow(pow);
+
+    // if (CuDevice::Instantiate().Enabled()) {
+    //   CuVector<Real> col_sum(rows, kUndefined);
+    //   cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim());
+    //   KALDI_LOG << "Sums vector is " << col_sum;
+    //   Real ans = col_sum.Sum();
+    //   KALDI_LOG << "Total sum is " << ans;
+    //   KALDI_ERR << "Stopping!";
+    // }
+
     Real observed_moment = Mpow.Sum() / (rows * cols);
     // see http://en.wikipedia.org/wiki/Normal_distribution#Moments,
     // note that mu = 0 and sigma = 1.
@@ -2705,10 +2723,12 @@ static void UnitTestCuMatrixSetRandUniform() {
         upper_bound = expected_moment + allowed_deviation;
     if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) {
       KALDI_LOG << "Random matrix is " << M;
+      //KALDI_LOG << "Random vector sum is " << col_sum;
       KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment
                 << ", expected " << expected_moment << ", allowed range "
                 << lower_bound << " to " << upper_bound;
     }
+    KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")";
   }
 }
@@ -3061,7 +3081,7 @@ template<typename Real> void CudaMatrixUnitTest() {
 int main() {
   SetVerboseLevel(1);
   int32 loop = 0;
-  bool test_threads = true;
+  bool test_threads = false;
   // num_threads only matters if test_threads == true.  Don't make it
   // to large, because it will affect CPU usage if you are using CPU.
   int32 num_threads = 4;
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 9897917a33f..56acf340823 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -253,7 +253,7 @@ void CuMatrixBase<Real>::CopyFromMat(const CuMatrixBase<OtherReal> &M,
   } else {
     // 2D thread block with warps (blockDim.x) along the row-dim of input M.
     // Each (8x32) thread block will transpose (32x32) data
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(M.NumCols(), warpSize),
                  n_blocks(M.NumRows(), warpSize));
@@ -859,7 +859,7 @@ void CuMatrixBase<Real>::DiffGroupPnorm(const CuMatrixBase<Real> &in_value,
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(NumCols(), dimBlock.x),
                  n_blocks(NumRows(), dimBlock.y));
@@ -1009,7 +1009,7 @@ void CuMatrixBase<Real>::AddSmat(Real alpha, const CuSparseMatrix<Real> &A,
     // We use warpSize threads per row to access only the nonzero elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of A.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y));
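Throughout these hunks the 2D block is warpSize x (CU1DBLOCK / warpSize): the total thread count per block stays fixed while the x-extent tracks the hardware warp. Assuming Kaldi's usual CU1DBLOCK of 256 (an assumption of this sketch, not stated in the patch), the shapes work out as:

    constexpr int kCu1dBlock = 256;  // stand-in for Kaldi's CU1DBLOCK
    static_assert(kCu1dBlock % 32 == 0 && kCu1dBlock % 64 == 0,
                  "block size must be a multiple of the warp/wavefront width");

    // dim3 dimBlock(warpSize, kCu1dBlock / warpSize) gives:
    //   NVIDIA (warpSize = 32): dim3(32, 8)  -- the 8x32 blocks the comments cite
    //   AMD    (warpSize = 64): dim3(64, 4)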
@@ -2186,7 +2186,7 @@ Real TraceMatMat(const CuMatrixBase<Real> &A,
     // if the matrix is not in a very bad shape.
     // (wider or taller than 32x8192)
     // CPU will then reduce to 1 element.
-    const int kWarpSize = 32;
+    const int kWarpSize = GPU_WARP_SIZE;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize),
                  n_blocks(A.NumRows(), kWarpSize));
@@ -2408,7 +2408,7 @@ void CuMatrixBase<Real>::CopyColsFromVec(const CuVectorBase<Real> &rv) {
     // and use transposed copy to fill *this
     // see CuMatrixBase<Real>::CopyFromMat() for more detail of the impl
     MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ };
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(rv_dim.cols, warpSize),
                  n_blocks(rv_dim.rows, warpSize));
@@ -2418,7 +2418,7 @@ void CuMatrixBase<Real>::CopyColsFromVec(const CuVectorBase<Real> &rv) {
   } else if (rv.Dim() == num_rows_) {
     // use 2D block (8x32) and large enough grid to cover matrix *this
     // dimBlock.x need to be at least warpSize for coalesced memory access.
-    const int32 warpSize = 32;
+    const int32 warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(num_cols_, dimBlock.x),
                  n_blocks(num_rows_, dimBlock.y));
diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc
index 93d10099466..1a82ce0d4df 100644
--- a/src/cudamatrix/cu-sparse-matrix.cc
+++ b/src/cudamatrix/cu-sparse-matrix.cc
@@ -148,7 +148,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all selected rows.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y));
@@ -558,7 +558,7 @@ Real TraceMatSmat(const CuMatrixBase<Real> &A,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of B.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y));
@@ -648,7 +648,7 @@ void CuSparseMatrix<Real>::CopyToMat(CuMatrixBase<Real> *M,
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows.
-    const int warpSize = 32;
+    const int warpSize = GPU_WARP_SIZE;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(NumRows(), dimBlock.y));
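The "warpSize threads per row" comments repeated above describe a warp-per-row CSR traversal: the lanes of one warp stride over a row's nonzeros, and CU1DBLOCK/warpSize rows share each block. A minimal sketch of that access pattern (illustrative names; a plain atomic combine stands in for the warp reduction the real kernels use):

    __global__ void warp_per_row_sum(const int *row_ptr, const float *vals,
                                     float *row_sums, int num_rows) {
      // blockDim.x == warpSize; blockDim.y rows are serviced per block.
      int row = blockIdx.x * blockDim.y + threadIdx.y;
      if (row >= num_rows) return;
      float sum = 0.0f;
      // Adjacent lanes read adjacent nonzeros, so the loads coalesce.
      for (int i = row_ptr[row] + threadIdx.x; i < row_ptr[row + 1];
           i += blockDim.x)
        sum += vals[i];
      atomicAdd(&row_sums[row], sum);  // the real kernels warp-reduce instead
    }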
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 1deb1cb8733..f6426297e49 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -639,7 +639,10 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
                   N.Data(), N.Stride(), beta, data_);
     } else {
       // Case 2: diag(M'*N) == sum(M.*N, 1)
-      // 16x16 or 8x32 2D block for coalesced memory access.
+      // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2
+      // or
+      // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE
+      // 2D block for coalesced memory access.
       // Grid shape is designed as follows,
       // 1. for small matrices, use 1D grid with only 1 row of 16x16 block,
       //    to avoid multiple kernel launch;
@@ -647,11 +650,11 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
       //    use 1- or 2-D grid so that the grid contains
       //    at least and not much larger than 'kOptNumBlocks' blocks
       //    to fully utilize the GPU;
-      const int32 warpSize = 32;
+      const int32 warpSize = GPU_WARP_SIZE;
       const int32 kOptNumBlocks = 512;
       const int32 tile_dim =
           (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ?
-          16 : 32;
+          GPU_WARP_SIZE/2 : GPU_WARP_SIZE;
       dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
       dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x),
                    n_blocks(N.NumRows(), dimBlock.y));
@@ -678,7 +681,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // One block per 'tile_dim' columns of N.
     // 1D grid expands along the row of N.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(N.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(),
@@ -687,7 +690,7 @@ void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
     // Case 4: diag(M'*N') == sum(N'.*M, 1)
     // Same kernel and config as case 3 except M and N are swapped.
     int tile_dim =
-        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16;
+        sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2;
     dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim);
     dim3 dimGrid(n_blocks(M.NumCols(), tile_dim));
     cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(),
diff --git a/src/hip/hipify.h b/src/hip/hipify.h
index 723b5b1f059..56d7e869a32 100644
--- a/src/hip/hipify.h
+++ b/src/hip/hipify.h
@@ -2,7 +2,19 @@
 #define __HIPIFY_H__
 
 #ifdef __HIPCC__
-inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
+inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
+  // On CDNA hardware, wave-fronts (warps) always execute in
+  // lock step. It might still be important to signal that the
+  // compiler can't reorder code around certain code sections
+  // that rely on data sharing mechanisms like LDS (shared
+  // memory). So this implements a no-op that the compiler
+  // sees as having side effects.
+  __asm__("s_nop 0");
+
+  // The safest option, arguably less performant, would be to use
+  // __asm__("s_waitcnt lgkmcnt(0)");
+  // to explicitly do a memory fence.
+}
 // AMDGCN only support this rounding mode.
 #define __fdiv_rd __fdiv_rn
 #else
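The s_nop above gives the optimizer an instruction with assumed side effects without paying for a real wait. A more conservative variant (a sketch of the alternative the comment alludes to, not what the patch ships) is an empty asm statement with a "memory" clobber, which is a pure compiler-level fence:

    // Emits no instruction, but the "memory" clobber forbids the compiler
    // from caching shared-memory values in registers or reordering the
    // surrounding loads/stores across this point. On CDNA the 64-lane
    // wavefront's lock-step execution supplies the actual synchronization.
    inline __device__ void syncwarp_compiler_fence(unsigned mask = 0xffffffffu) {
      (void)mask;  // lanes of a wavefront execute in lock step anyway
      __asm__ volatile("" ::: "memory");
    }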
@@ -153,7 +165,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {}
 #define cudaMallocHost hipHostMalloc
 #define cudaMallocPitch hipMallocPitch
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
+// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized
+// copies, which should be resolved in ROCm 5.7.1+. Then the following would
+// be sufficient:
+//   #define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \
+  [&]() -> hipError_t {                             \
+    if (width && height)                            \
+      return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \
+    return hipSuccess;                              \
+  }()
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -166,8 +187,7 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 #define cudaStreamCreate hipStreamCreate
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamLegacy ((hipStream_t)1)
-#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread ((hipStream_t)2)
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent hipStreamWaitEvent
@@ -243,6 +263,13 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define cub hipcub
 
+//
+// Callback qualifier
+//
+#define CUDART_CB
+#define GPU_WARP_SIZE 64
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
 
 #endif //__HIPIFY_H__
diff --git a/src/makefiles/hip_64bit.mk b/src/makefiles/hip_64bit.mk
index 8d85872aa9b..aec3e359f53 100644
--- a/src/makefiles/hip_64bit.mk
+++ b/src/makefiles/hip_64bit.mk
@@ -37,11 +37,14 @@ ROCM_FLAGS = $(ROCM_USEROCTX) -fPIC -DHAVE_CUDA=1 \
              -D__IS_HIP_COMPILE__=1 \
              -DROCM_MAJOR_VERSION=$(ROCM_MAJOR_VERSION) -DROCM_MINOR_VERSION=$(ROCM_MINOR_VERSION) \
              -D__CUDACC_VER_MAJOR__=11 -DCUDA_VERSION=11000 \
-             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics
+             -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) -std=c++14 -munsafe-fp-atomics \
+             $(EXTRA_ROCM_FLAGS)
+
 # TODO: Consider use ROCM_LDFLAGS/ROCM_LDLIBS or generic GPU_LDFLAGS/GPU_LDLIBS in the makefiles.
 # We allow the libraries we link against to have undefined symbols so as this can be build in
 # systems with no development version of these libraries (e.g. ncurses).
 CUDA_LDFLAGS += -L$(ROCMDIR)/lib -Wl,-rpath,$(ROCMDIR)/lib
 CUDA_LDLIBS += -lhipblas -lhipsparse -lhipsolver -lhiprand -lhipfft -lroctx64 -lamdhip64 -Wl,--allow-shlib-undefined
-LDLIBS += -Wl,--allow-shlib-undefined
+LDFLAGS += $(CUDA_LDFLAGS)
+LDLIBS += $(CUDA_LDLIBS)

From ba4e18fcb2987b7172057aa5fc2613a9e1c1f2f8 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 17:14:29 +0000
Subject: [PATCH 42/76] Move misplaced #pragma unroll.

---
 src/cudamatrix/cu-kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ac532790b86..349b21b6591 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2135,8 +2135,8 @@ static void _group_transform_reduce(
   __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
-#   pragma unroll
   int shift = threads_per_group / 2;
+#   pragma unroll
   for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);

From dac0b272cfff3fba9be4b3cfdd2767271e0d4760 Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Mon, 6 Nov 2023 23:46:48 +0000
Subject: [PATCH 43/76] Working version trimmed of legacy ROCm < 5.2 code.
--- .gitignore | 4 - src/chain/Makefile | 12 -- src/configure | 22 +-- src/cudadecoder/Makefile | 12 -- src/cudadecoder/cuda-decoder.cc | 2 +- src/cudafeat/Makefile | 12 -- .../feature-online-batched-ivector-cuda.cc | 38 ----- .../feature-online-batched-spectral-cuda.h | 4 - src/cudafeat/feature-online-cmvn-cuda.cu | 1 + src/cudafeat/feature-spectral-cuda.h | 4 - src/cudafeat/online-ivector-feature-cuda.cc | 20 +-- src/cudamatrix/Makefile | 12 -- src/cudamatrix/cu-allocator.cc | 4 - src/cudamatrix/cu-allocator.h | 4 - src/cudamatrix/cu-block-matrix.cc | 4 - src/cudamatrix/cu-common.h | 5 - src/cudamatrix/cu-compressed-matrix.cc | 4 - src/cudamatrix/cu-device.cc | 5 +- src/cudamatrix/cu-device.h | 9 -- src/cudamatrix/cu-kernels.cu | 33 +--- src/cudamatrix/cu-math-test.cc | 11 +- src/cudamatrix/cu-matrix-test.cc | 30 +--- src/cudamatrix/cu-matrix.cc | 4 - src/cudamatrix/cu-packed-matrix.cc | 4 - src/cudamatrix/cu-sp-matrix.cc | 4 - src/cudamatrix/cu-sparse-matrix.cc | 4 - src/cudamatrix/cu-tp-matrix.cc | 4 - src/cudamatrix/cu-vector.cc | 4 - src/hip/hipify.h | 12 -- src/hip/math_constants.h | 152 ------------------ src/makefiles/hip_64bit.mk | 3 + 31 files changed, 29 insertions(+), 414 deletions(-) delete mode 100644 src/hip/math_constants.h diff --git a/.gitignore b/.gitignore index 53a4079d9ef..9f8c727d4d0 100644 --- a/.gitignore +++ b/.gitignore @@ -90,7 +90,3 @@ venv/ # CMakeLists.txt files are currently autogenerated, must not be committed. /src/**/CMakeLists.txt /build* - -# Eclipse sync project -.ptp-sync -.ptp-sync-folder diff --git a/src/chain/Makefile b/src/chain/Makefile index 5b177981ad8..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -33,21 +33,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/configure b/src/configure index 5f9c48a6cde..37a75a5cade 100755 --- a/src/configure +++ b/src/configure @@ -295,12 +295,11 @@ function configure_rocm { ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk - # Enable HIP implementation for CXX compile commands. ROCm 5.2.0 onwards use - # __HIP_PLATFORM_AMD__ others __HIP_PLATFORM_HCC__ - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "CXXFLAGS += -D__HIP_PLATFORM_AMD__=1" >> kaldi.mk - else - echo "CXXFLAGS += -D__HIP_PLATFORM_HCC__=1" >> kaldi.mk + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; fi # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, @@ -309,17 +308,10 @@ function configure_rocm { cat makefiles/hip_64bit.mk >> kaldi.mk else echo "\ -WARNING: ROCM will not be used! 
- ROCM is only supported with 64-bit Linux builds." +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." exit 1; fi - - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -ge 2 ] || [ $ROCM_MAJOR_VERSION -gt 5 ] ; then - echo "ROCM_FLAGS += -fgpu-default-stream=per-thread" >> kaldi.mk - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = false" >> kaldi.mk - else - echo "ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION = true" >> kaldi.mk - fi } diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index d4eda345564..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -41,21 +41,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 06dceae73a5..9baa274e2ea 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -199,7 +199,7 @@ void CudaDecoder::AllocateHostData() { (void**)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void**)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index c0f54a854e8..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -44,21 +44,9 @@ ifeq ($(CUDA), true) $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) endif -endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 68c247b43e9..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -382,43 +382,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( #if CUDA_VERSION >= 9010 int nrhs = 1; - -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work; - - // perform factorization in batched - 
CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, &L_work, num_lanes)); - // allocate temp buffer - float *workspace = static_cast( - CuDevice::Instantiate().Malloc(L_work * sizeof(float))); - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrfBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, - ivector_dim_, workspace, L_work, d_infoArray_, num_lanes)); - - int L_work2; - - // perform factorization in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched_bufferSize( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, &L_work2, num_lanes)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - // solve for rhs in batched - CUSOLVER_SAFE_CALL(hipsolverSpotrsBatched( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, - quad_array_, ivector_dim_, ivec_array_, ivector_dim_, workspace2, L_work2, d_infoArray_, - num_lanes)); - - CuDevice::Instantiate().Free(workspace); - CuDevice::Instantiate().Free(workspace2); -#else // perform factorization in batched CUSOLVER_SAFE_CALL(cusolverDnSpotrfBatched( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, quad_array_, @@ -429,7 +392,6 @@ void BatchedIvectorExtractorCuda::ComputeIvectorsFromStats( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, ivector_dim_, nrhs, quad_array_, ivector_dim_, ivec_array_, ivector_dim_, d_infoArray_, num_lanes)); -#endif #endif // cusolver solves in place. Ivectors are now in linear_ diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 202232c6b23..113657ce317 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index f8947a3b5ed..bb78028118f 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -27,6 +27,7 @@ #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. #ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 66f0dce395a..5625592a717 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -20,11 +20,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index 56dbac93165..fa0e9f68237 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -299,14 +299,13 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( // Forming new non-SP matrix for cusolver. 
CuMatrix A(quadratic); - - #ifdef CHOLESKY // query temp buffer size int L_work; CUSOLVER_SAFE_CALL( cusolverDnSpotrf_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), A.Data(), A.Stride(), &L_work)); + // allocate temp buffer float *workspace = static_cast( CuDevice::Instantiate().Malloc(L_work * sizeof(float))); @@ -317,26 +316,9 @@ void IvectorExtractorFastCuda::ComputeIvectorFromStats( A.Stride(), workspace, L_work, d_info_)); // solve for rhs -#if defined(__IS_HIP_COMPILE__) && (ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2) - // query temp buffer size - int L_work2; - CUSOLVER_SAFE_CALL( - hipsolverSpotrs_bufferSize(GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, &L_work2)); - // allocate temp buffer - float *workspace2 = static_cast( - CuDevice::Instantiate().Malloc(L_work2 * sizeof(float))); - - CUSOLVER_SAFE_CALL(hipsolverSpotrs( - GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, - A.Data(), A.Stride(), ivector->Data(), ivector_dim_, workspace2, L_work2, d_info_)); - - CuDevice::Instantiate().Free(workspace2); -#else CUSOLVER_SAFE_CALL(cusolverDnSpotrs( GetCusolverDnHandle(), CUBLAS_FILL_MODE_LOWER, A.NumRows(), nrhs, A.Data(), A.Stride(), ivector->Data(), ivector_dim_, d_info_)); -#endif CuDevice::Instantiate().Free(workspace); #else diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 3c1100753e5..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -34,20 +34,8 @@ ifeq ($(CUDA), true) endif ifeq ($(ROCM), true) -ifeq ($(ROCM_NEEDS_KERNEL_LAUNCH_HIPIFICATION), true) -.PRECIOUS: %.hip -%.hip : %.cu - LA='[^\(,]+\([^\)]+\)|[^,]+' ; \ - cat $< | \ - sed -r "s#<<<($$LA),($$LA)>>>#<<<\1,\2,0,hipStreamPerThread>>>#g" | \ - sed -r "s#<<<($$LA),($$LA),($$LA)>>>#<<<\1,\2,\3,hipStreamPerThread>>>#g" | \ - cat > $@ -%.o : %.hip - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -else %.o : %.cu $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ endif -endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index d81dca002ce..abd08a9b015 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -25,11 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f776bbb620e..1ed7e54b541 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 7983cd250e7..fd17fe61893 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index c4bdf569d3c..41ef7536a7f 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,13 +32,8 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || 
ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 442d2dbac67..e42c93f1b67 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 25775fb1b05..4d0be20ddc3 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -21,13 +21,10 @@ // limitations under the License. + #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include #include #include "hipify.h" diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 67b9f1d9e9b..bb1170314c4 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,13 +29,8 @@ #include #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#include -#else #include #include -#endif #include #include #include @@ -49,11 +44,7 @@ #endif #if CUDA_VERSION >= 9010 #ifdef __IS_HIP_COMPILE__ -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #else #include #endif diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 349b21b6591..3d7fae5c15e 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,15 +27,18 @@ #include #include -#include #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 +#include #include #include "hipify.h" +#define CUDART_INF HIP_INF +#define CUDART_INF_F HIP_INF_F #include "cudamatrix/cu-kernels-ansi.h" #include #include #else +#include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION @@ -2048,27 +2051,9 @@ static void _transform_reduce_mat_cols( for (int j = tid; j < d.cols; j += CU1DBLOCK) { tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); } - - // if (tid == 0) { - // for (int j = 0; j < d.cols; j += 1) - // tdata = op.Reduce(tdata, op.Transform(mat[row_start + j])); - // result[i] = tdata; - - // } - // return; - sdata[tid] = tdata; __syncthreads(); - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < CU1DBLOCK; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - // Tree reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2077,16 +2062,6 @@ static void _transform_reduce_mat_cols( __syncthreads(); } - // if (tid == 0) { - // tdata = 0; - // for (int j = 0; j < 2*warpSize; j += 1) - // tdata = op.Reduce(tdata, op.Transform(sdata[j])); - // result[i] = tdata; - // } - - // return; - - // Reduce last warp. Threads implicitly synchronized within a warp. for (int shift = warpSize; shift > 0; shift >>= 1) { if (tid < warpSize) { diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 1245fb28bad..e1d59e777be 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -214,9 +214,9 @@ void UnitTestLstmNonlinearity() { for (int32 loop = 0; loop < 10; loop++) { // problem dimensions. - int32 num_rows = RandInt(5, 20), //16 - cell_dim = RandInt(2, 200), //45 - dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); //3 + int32 num_rows = RandInt(5, 20), + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -232,6 +232,7 @@ void UnitTestLstmNonlinearity() { else test_params = -1; + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); @@ -276,11 +277,11 @@ void UnitTestLstmNonlinearity() { for (int32 i = 0; i < test_dim; i++) { CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); - if (test_input >= 0) { // -1 + if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); delta_input.Scale(delta); } - if (test_params >= 0) { // 0 + if (test_params >= 0) { delta_params.Row(test_params).SetRandn(); delta_params.Scale(delta); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 26a5281ec05..ecddd24db19 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2675,19 +2675,11 @@ static void UnitTestCuMatrixSetRandn() { template static void UnitTestCuMatrixSetRandUniform() { - - // if (CuDevice::Instantiate().Enabled()) { - // CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(GetCurandHandle(), 123456)); - // } - for (int32 i = 0; i < 2; i++) { - MatrixIndexT rows = 180+Rand() % 200, cols = 200+Rand() % 200; + MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - // M.SetZero(); - // M.Add(0.5); - // M.SetZeroAboveDiag(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. // Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2701,16 +2693,6 @@ static void UnitTestCuMatrixSetRandUniform() { for (int32 pow = 1; pow < central_moments.Dim(); pow++) { CuMatrix Mpow(M); Mpow.ApplyPow(pow); - - // if (CuDevice::Instantiate().Enabled()) { - // CuVector col_sum(rows, kUndefined); - // cuda_sum_mat_cols(rows, CU1DBLOCK, col_sum.Data(), Mpow.Data(), Mpow.Dim()); - // KALDI_LOG << "Sums vector is " << col_sum; - // Real ans = col_sum.Sum(); - // KALDI_LOG << "Total sum is " << ans; - // KALDI_ERR << "Stopping!"; - // } - Real observed_moment = Mpow.Sum() / (rows * cols); // see http://en.wikipedia.org/wiki/Normal_distribution#Moments, // note that mu = 0 and sigma = 1. @@ -2723,13 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - //KALDI_LOG << "Random vector sum is " << col_sum; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - KALDI_LOG << "Moment[" << pow << "] is " << observed_moment << " (" << expected_moment << ")"; - } + } } } @@ -3081,7 +3061,7 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; - bool test_threads = false; + bool test_threads = true; // num_threads only matters if test_threads == true. Don't make it // to large, because it will affect CPU usage if you are using CPU. 
int32 num_threads = 4; diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 56acf340823..fd31758f0e6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -29,11 +29,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 4de0fcba63d..c9d686d0ce8 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -23,11 +23,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 86a3cd9a726..a6c7d7720e4 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 35ba3ee0c81..cda575b1914 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 739bab3dd59..378cc8e4e38 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -21,11 +21,7 @@ #if HAVE_CUDA==1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index f6426297e49..c88b3ebf50c 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -24,11 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include -#if ROCM_MAJOR_VERSION < 5 || ROCM_MINOR_VERSION < 2 -#include -#else #include -#endif #include "hipify.h" #else #include diff --git a/src/hip/hipify.h b/src/hip/hipify.h index 56d7e869a32..efe4848c009 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -211,7 +211,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed #define curandSetStream hiprandSetStream #define curandStatus_t hiprandStatus_t -#if ROCM_MAJOR_VERSION == 5 && ROCM_MINOR_VERSION >= 1 || ROCM_MAJOR_VERSION > 5 #define cusolverDnCreate hipsolverDnCreate #define cusolverDnDestroy hipsolverDnDestroy #define cusolverDnHandle_t hipsolverDnHandle_t @@ -221,17 +220,6 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) { #define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize #define cusolverDnSpotrs hipsolverDnSpotrs #define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#else -#define cusolverDnCreate hipsolverCreate -#define cusolverDnDestroy hipsolverDestroy -#define cusolverDnHandle_t hipsolverHandle_t -#define cusolverDnSetStream hipsolverSetStream -#define cusolverDnSpotrf hipsolverSpotrf -#define cusolverDnSpotrfBatched hipsolverSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverSpotrf_bufferSize -#define cusolverDnSpotrs 
hipsolverSpotrs -#define cusolverDnSpotrsBatched hipsolverSpotrsBatched -#endif #define cusparseAction_t hipsparseAction_t #define cusparseCreate hipsparseCreate #define cusparseCreateCsr hipsparseCreateCsr diff --git a/src/hip/math_constants.h b/src/hip/math_constants.h deleted file mode 100644 index 7fb8fce8e71..00000000000 --- a/src/hip/math_constants.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__MATH_CONSTANTS_H__) -#define __MATH_CONSTANTS_H__ - -/* single precision constants */ -#define CUDART_INF_F __int_as_float(0x7f800000) -#define CUDART_NAN_F __int_as_float(0x7fffffff) -#define CUDART_MIN_DENORM_F __int_as_float(0x00000001) -#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffff) -#define CUDART_NEG_ZERO_F __int_as_float(0x80000000) -#define CUDART_ZERO_F 0.0f -#define CUDART_ONE_F 1.0f -#define CUDART_SQRT_HALF_F 0.707106781f -#define CUDART_SQRT_HALF_HI_F 0.707106781f -#define CUDART_SQRT_HALF_LO_F 1.210161749e-08f -#define CUDART_SQRT_TWO_F 1.414213562f -#define CUDART_THIRD_F 0.333333333f -#define CUDART_PIO4_F 0.785398163f -#define CUDART_PIO2_F 1.570796327f -#define CUDART_3PIO4_F 2.356194490f -#define CUDART_2_OVER_PI_F 0.636619772f -#define CUDART_SQRT_2_OVER_PI_F 0.797884561f -#define CUDART_PI_F 3.141592654f -#define CUDART_L2E_F 1.442695041f -#define CUDART_L2T_F 3.321928094f -#define CUDART_LG2_F 0.301029996f -#define CUDART_LGE_F 0.434294482f -#define CUDART_LN2_F 0.693147181f -#define CUDART_LNT_F 2.302585093f -#define CUDART_LNPI_F 1.144729886f -#define CUDART_TWO_TO_M126_F 1.175494351e-38f -#define CUDART_TWO_TO_126_F 8.507059173e37f -#define CUDART_NORM_HUGE_F 3.402823466e38f -#define CUDART_TWO_TO_23_F 8388608.0f -#define CUDART_TWO_TO_24_F 16777216.0f -#define CUDART_TWO_TO_31_F 2147483648.0f -#define CUDART_TWO_TO_32_F 4294967296.0f -#define CUDART_REMQUO_BITS_F 3 -#define CUDART_REMQUO_MASK_F (~((~0)< Date: Mon, 6 Nov 2023 23:59:05 +0000 Subject: [PATCH 44/76] Fix formating to Google style. --- .../batched-static-nnet3-kernels.h | 1 + ...hed-threaded-nnet3-cuda-online-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline.cc | 1 + .../batched-threaded-nnet3-cuda-pipeline2.cc | 1 + src/cudadecoder/cuda-decoder-kernels.cu | 3 +- src/cudadecoder/cuda-decoder.cc | 22 +- src/cudadecoder/cuda-decoder.h | 1 + src/cudadecoder/cuda-fst.cc | 1 + .../batched-wav-nnet3-cuda-online.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda.cc | 2 +- src/cudadecoderbin/batched-wav-nnet3-cuda2.cc | 1 + ...eature-online-batched-cmvn-cuda-kernels.cu | 1 + ...ure-online-batched-ivector-cuda-kernels.cu | 27 +- ...re-online-batched-spectral-cuda-kernels.cu | 4 +- .../feature-online-batched-spectral-cuda.h | 1 + src/cudafeat/feature-online-cmvn-cuda.cu | 4 +- src/cudafeat/feature-spectral-cuda.cu | 2 + src/cudafeat/feature-spectral-cuda.h | 1 + src/cudafeat/feature-window-cuda.cu | 1 + .../online-batched-feature-pipeline-cuda.cc | 4 +- .../online-ivector-feature-cuda-kernels.cu | 28 +- src/cudafeat/online-ivector-feature-cuda.cc | 1 + src/cudamatrix/cu-allocator.cc | 2 +- src/cudamatrix/cu-allocator.h | 3 +- src/cudamatrix/cu-array-inl.h | 1 + src/cudamatrix/cu-array.cc | 1 + src/cudamatrix/cu-block-matrix.cc | 1 + src/cudamatrix/cu-common.cc | 121 +++-- src/cudamatrix/cu-common.h | 5 +- src/cudamatrix/cu-compressed-matrix.cc | 1 + src/cudamatrix/cu-device.cc | 11 +- src/cudamatrix/cu-device.h | 5 +- src/cudamatrix/cu-kernels.cu | 107 ++-- src/cudamatrix/cu-matrix-test.cc | 6 +- src/cudamatrix/cu-matrix.cc | 1 + src/cudamatrix/cu-packed-matrix.cc | 1 + src/cudamatrix/cu-sp-matrix.cc | 1 + src/cudamatrix/cu-sparse-matrix.cc | 1 + src/cudamatrix/cu-tp-matrix.cc | 1 + src/cudamatrix/cu-vector.cc | 16 +- src/cudamatrix/cublas-wrappers.h | 13 +- src/hip/hipify.h | 488 +++++++++--------- 42 files changed, 512 insertions(+), 384 deletions(-) diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 
0bcb1997576..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index c7012b686e0..ed0c0a2f5e9 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -23,6 +23,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index d5cf7dae2d7..23d0ca283a2 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -28,6 +28,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index f6a3455db01..01d6b1165e7 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 6a14371911d..8503182c1f8 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -16,8 +16,9 @@ // limitations under the License. #ifdef __IS_HIP_COMPILE__ -#include "float.h" #include + +#include "float.h" #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 9baa274e2ea..056d563a791 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -40,6 +40,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -190,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void**)&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_extra_prev_tokens_concat_tmp_, + (void 
**)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void**)&h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index 510904aa004..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 3af37eb7676..682485f6ce4 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 56368853df2..2bc0a483a0f 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -25,8 +25,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 05af50d7a3b..0e4a719bc75 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -19,8 +19,8 @@ #ifdef __IS_HIP_COMPILE__ #include "hip/hip_runtime.h" -#include "roctracer/roctx.h" #include "hipify.h" +#include "roctracer/roctx.h" #else #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index c14571f2ed9..b2ad9254c67 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -22,6 +22,7 @@ #include #include #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index 7a521d43693..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -17,6 +17,7 @@ // #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index e5b89d163e5..da2ba24bd90 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -102,8 +103,9 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, 
GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, num_lanes); zero_invalid_posteriors_kernel<<>>( @@ -217,8 +219,10 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -314,8 +318,8 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // is less than stash size KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); - // This only works if stash size is <= GPU_WARP_SIZE as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); @@ -325,9 +329,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -510,7 +516,8 @@ void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - int block = (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index 27375f4914e..856d2acab81 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -18,8 +18,10 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" #ifdef __IS_HIP_COMPILE__ -#include #include + +#include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index 113657ce317..d18f5237e8f 100644 --- 
a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index bb78028118f..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -18,6 +18,7 @@ #ifdef __IS_HIP_COMPILE__ #define __CUDA_ARCH__ 800 #include + #include "hipify.h" #else #include @@ -189,7 +190,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 9c0d5df5288..d8fc215b80b 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -19,7 +19,9 @@ #ifdef __IS_HIP_COMPILE__ #include + #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 5625592a717..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -21,6 +21,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index 6ba45e682c1..60fe113d402 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -19,6 +19,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 650b51ec3c7..7736f525237 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -100,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL(cudaMallocHost((void**)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index dffc9fd3c8f..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -17,6 +17,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include @@ -34,9 +35,12 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps - __shared__ typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - 
__shared__ float s_A[GPU_MAX_WARPS_PER_BLOCK][GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -47,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -139,7 +145,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; @@ -207,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -215,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index fa0e9f68237..f96b2a81ce2 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -18,6 +18,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" // The BLAS enumerators are used instead of the SOLVER ones. 
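// A short sketch for clarity (an assumption; the guarded block itself lies
// outside this hunk): the cuSOLVER entry points used here, e.g.
// cusolverDnSpotrf_bufferSize, take a cublasFillMode_t argument, and
// hipify.h maps those BLAS enumerators to hipBLAS equivalents, so a call
// such as
//   cusolverDnSpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, A, lda,
//                               &lwork);
// presumably compiles against hipSOLVER/hipBLAS unchanged under
// __IS_HIP_COMPILE__.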
#ifdef CUBLAS_FILL_MODE_LOWER diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index abd08a9b015..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -26,6 +26,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -33,7 +34,6 @@ #include #endif - #include #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 1ed7e54b541..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -24,9 +24,10 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 1fd80502cf9..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -30,6 +30,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc index 333e8fbed1c..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -24,6 +24,7 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index fd17fe61893..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 2e77062f20d..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include + #include "hipify.h" #define API_NAME_PREFIX "HIP" #else @@ -59,7 +60,7 @@ NvtxTracer::~NvtxTracer() { #ifdef __IS_HIP_COMPILE__ roctxRangePop(); #else - nvtxRangePop(); + nvtxRangePop(); #endif } #endif @@ -102,19 +103,31 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + 
case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR:return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; #endif } return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; @@ -124,20 +137,32 @@ const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } return 
API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } @@ -146,21 +171,35 @@ const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; #ifdef __IS_HIP_COMPILE__ - case HIPRAND_STATUS_NOT_IMPLEMENTED: return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; #endif } return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 41ef7536a7f..934668da6f2 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -32,11 +32,12 @@ #if HAVE_CUDA #ifdef __IS_HIP_COMPILE__ -#include -#include #include +#include #include +#include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index e42c93f1b67..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -22,6 
+22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 4d0be20ddc3..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -24,15 +24,16 @@ #if HAVE_CUDA == 1 #ifdef __IS_HIP_COMPILE__ -#include #include #include +#include + #include "hipify.h" #else #include #include #include -#endif // __IS_HIP_COMPILE__ +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -246,12 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode - #ifdef __IS_HIP_COMPILE__ +#ifdef __IS_HIP_COMPILE__ KALDI_WARN << "Not in compute-exclusive mode."; - #else +#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; - #endif +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index bb1170314c4..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -29,11 +29,12 @@ #include #ifdef __IS_HIP_COMPILE__ -#include -#include #include #include +#include #include +#include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 3d7fae5c15e..8d5784acb52 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -31,18 +31,18 @@ #define __CUDA_ARCH__ 800 #include #include + #include "hipify.h" -#define CUDART_INF HIP_INF -#define CUDART_INF_F HIP_INF_F -#include "cudamatrix/cu-kernels-ansi.h" -#include #include +#include + +#include "cudamatrix/cu-kernels-ansi.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION -#endif //__IS_HIP_COMPILE__ +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -1122,7 +1122,7 @@ void trace_mat_mat_trans_atomic(Real *d_result, // Assuming *d_result is set to zero already constexpr int THREADS_X = GPU_WARP_SIZE; - constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK/2; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -2111,7 +2111,7 @@ static void _group_transform_reduce( // tree-reduce to 2x warpSize elements per group int shift = threads_per_group / 2; -# pragma unroll +#pragma unroll for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); @@ -4009,9 +4009,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4390,7 +4390,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, 
B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4413,8 +4413,8 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -4426,10 +4426,10 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5105,7 +5105,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5128,8 +5128,8 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, v, stride_v); #ifdef __IS_HIP_COMPILE__ } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> <<>>(alpha, M, stride_M, N, dim_N, beta, - v, stride_v); + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); #endif } } @@ -5141,10 +5141,10 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); #ifdef __IS_HIP_COMPILE__ - } else if (Bl.x==64) { - _add_diag_mat_mat_MN<64> <<>>(alpha,M,stride_M,N,dim_N,beta,v); + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); #endif } } @@ -5516,25 +5516,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5831,8 +5831,9 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // this will synchronize all threads without blocking. 
void cuda_legacy_noop() { #ifdef __IS_HIP_COMPILE__ - // HIP doesn't currently support cudaStreamLegacy stream so we force the implementation to use the - // legacy (not per-thread) API to get similar semantics. + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. auto k = reinterpret_cast(_noop_kernel); hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); #else @@ -5847,8 +5848,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5861,8 +5864,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_cols+GPU_WARP_SIZE-1)/GPU_WARP_SIZE,(num_rows+GPU_MAX_WARPS_PER_BLOCK-1)/GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5871,8 +5876,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5898,9 +5902,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5920,10 +5925,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5936,8 +5942,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(GPU_WARP_SIZE,GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5963,9 +5968,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5986,10 +5992,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index ecddd24db19..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2679,7 +2679,7 @@ static void UnitTestCuMatrixSetRandUniform() { MatrixIndexT rows = 180 + Rand() % 200, cols = 200 + Rand() % 200; CuMatrix M(rows, cols); M.SetRandUniform(); - + M.Add(-0.5); // we'll be testing the central moments, so // center it around zero first. 
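// (Worked check: after the Add(-0.5) the entries are uniform on
// [-1/2, 1/2]; the odd central moments of this distribution vanish and the
// even ones are E[x^p] = (1/2)^p / (p + 1), e.g. 1/12 for p = 2 and 1/80
// for p = 4.)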
// Got these moments from http://mathworld.wolfram.com/UniformDistribution.html @@ -2705,11 +2705,11 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } - } + } } } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index fd31758f0e6..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -30,6 +30,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index c9d686d0ce8..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -24,6 +24,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index a6c7d7720e4..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index cda575b1914..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 378cc8e4e38..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -22,6 +22,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index c88b3ebf50c..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -25,6 +25,7 @@ #ifdef __IS_HIP_COMPILE__ #include #include + #include "hipify.h" #else #include @@ -649,8 +650,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - GPU_WARP_SIZE/2 : GPU_WARP_SIZE; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -676,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? 
GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? GPU_WARP_SIZE : GPU_WARP_SIZE/2; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index dc5c0e0ced5..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -37,8 +37,9 @@ inline cublasStatus_t cublas_gemm( const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUBLAS_R_32F,lda,B,CUBLAS_R_32F,ldb,&beta, - C,CUBLAS_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -66,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUBLAS_R_32F, lda, - (const void**)B, CUBLAS_R_32F, ldb, &beta, (void**)C, CUBLAS_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif diff --git a/src/hip/hipify.h b/src/hip/hipify.h index efe4848c009..e9ca483d022 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -2,250 +2,262 @@ #define __HIPIFY_H__ #ifdef __HIPCC__ -inline __device__ void __syncwarp(unsigned mask=0xffffffff) { - // On CDNA hardware wave-fronts (warps) execute always in - // lock step. Though it might still be important to signal - // that the compiler can't reorder code around certain code - // sections that rely on data sharing mecanisms like LDS - // (shared memory). So this implements a No-op but is seen - // by the compiler as having side effects. - __asm__("s_nop 0"); +inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { + // On CDNA hardware wave-fronts (warps) always execute in + // lock step. Though it might still be important to signal + // that the compiler can't reorder code around certain code + // sections that rely on data-sharing mechanisms like LDS + // (shared memory). 
So this implements a no-op but is seen + // by the compiler as having side effects. + __asm__("s_nop 0"); - // A saffest option, arguably less performant would be to use: - // __asm__("s_waitcnt lgkmcnt(0)"); - // to explicitly do a memory fence. + // The safest, though arguably less performant, option would be to use: + // __asm__("s_waitcnt lgkmcnt(0)"); + // to explicitly do a memory fence. } // AMDGCN only support this rounding mode. #define __fdiv_rd __fdiv_rn #else -#define __align__(x) __attribute__((aligned (x))) +#define __align__(x) __attribute__((aligned(x))) #endif // // HIP types // -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_COMPUTE_32F_FAST_TF32 HIPBLAS_R_32F // TODO: Verify that plain float compute are viable replacements for the tensor cores alternative. -#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT -#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER -#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements for explicit tensor GEMMs. -#define CUBLAS_OP_C HIPBLAS_OP_C -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_R_32F HIPBLAS_R_32F -#define CUBLAS_R_64F HIPBLAS_R_64F -#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUDA_R_32F HIP_R_32F -#define CUDA_R_64F HIP_R_64F -#define CUFFT_R2C HIPFFT_R2C -#define CUFFT_SUCCESS HIPFFT_SUCCESS -#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT -#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED -#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH -#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED -#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR -#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE -#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE -#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED -#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE -#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE -#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS -#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR -#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH -#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC -#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I -#define CUSPARSE_INDEX_BASE_ZERO 
HIPSPARSE_INDEX_BASE_ZERO -#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE -#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE -#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN -#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 -#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED -#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH -#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED -#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES -#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR -#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE -#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR -#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED -#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED -#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS -#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT -#define cuDeviceGetName hipDeviceGetName -#define cuMemGetInfo_v2 hipMemGetInfo -#define cublasComputeType_t hipblasDatatype_t -#define cublasCreate hipblasCreate -#define cublasDasum_v2 hipblasDasum -#define cublasDaxpy_v2 hipblasDaxpy -#define cublasDcopy_v2 hipblasDcopy -#define cublasDdot_v2 hipblasDdot -#define cublasDestroy hipblasDestroy -#define cublasDgemmBatched hipblasDgemmBatched -#define cublasDgemm_v2 hipblasDgemm -#define cublasDgemv_v2 hipblasDgemv -#define cublasDger_v2 hipblasDger -#define cublasDnrm2_v2 hipblasDnrm2 -#define cublasDscal_v2 hipblasDscal -#define cublasDspmv_v2 hipblasDspmv -#define cublasDspr_v2 hipblasDspr -#define cublasDsyrk_v2 hipblasDsyrk -#define cublasDtpmv_v2 hipblasDtpmv -#define cublasDtrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasDtrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cublasFillMode_t hipblasFillMode_t -#define cublasGemmAlgo_t hipblasGemmAlgo_t -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmEx hipblasGemmEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasOperation_t hipblasOperation_t -#define cublasSasum_v2 hipblasSasum -#define cublasSaxpy_v2 hipblasSaxpy -#define cublasScopy_v2 hipblasScopy -#define cublasSdot_v2 hipblasSdot -#define cublasSetStream hipblasSetStream -#define cublasSgemv_v2 hipblasSgemv -#define cublasSger_v2 hipblasSger -#define cublasSnrm2_v2 hipblasSnrm2 -#define cublasSscal_v2 hipblasSscal -#define cublasSspmv_v2 hipblasSspmv -#define cublasSspr_v2 hipblasSspr -#define cublasSsyrk_v2 hipblasSsyrk -#define cublasStatus_t hipblasStatus_t -#define cublasStatus_t hipblasStatus_t -#define cublasStpmv_v2 hipblasStpmv -#define cublasStrsm_v2(a,b,c,d,e,f,g,h,i,j,k,l) hipblasStrsm(a,b,c,d,e,f,g,h,const_cast(i),j,k,l) -#define cudaComputeModeExclusive hipComputeModeExclusive -#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess -#define cudaDataType hipDataType -#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize -#define cudaDeviceGetAttribute hipDeviceGetAttribute -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceReset hipDeviceReset -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse -#define cudaErrorInvalidDevice hipErrorInvalidDevice -#define cudaError_t hipError_t -#define cudaEventCreate hipEventCreate 
-#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDestroy hipEventDestroy -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaFree hipFree -#define cudaFreeHost hipFreeHost -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorName hipGetErrorName -#define cudaGetErrorString hipGetErrorString -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterDefault hipHostRegisterDefault -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#define cudaMalloc hipMalloc -#define cudaMallocHost hipHostMalloc -#define cudaMallocPitch hipMallocPitch -#define cudaMemcpy hipMemcpy -// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_COMPUTE_32F_FAST_TF32 \ + HIPBLAS_R_32F // TODO: Verify that plain float compute are viable + // replacements for the tensor cores alternative. +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP \ + HIPBLAS_GEMM_DEFAULT // TODO: Verify regular GEMMs are viable replacements + // for explicit tensor GEMMs. 
+#define CUBLAS_OP_C HIPBLAS_OP_C +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_R_32F HIPBLAS_R_32F +#define CUBLAS_R_64F HIPBLAS_R_64F +#define CUBLAS_SIDE_LEFT HIPBLAS_SIDE_LEFT +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_LICENSE_ERROR HIPBLAS_STATUS_UNKNOWN +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_SUCCESS HIPFFT_SUCCESS +#define CURAND_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED \ + HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define CUSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN +#define CUSPARSE_SPMM_CSR_ALG2 HIPSPARSE_SPMM_CSR_ALG2 +#define CUSPARSE_STATUS_ALLOC_FAILED HIPSPARSE_STATUS_ALLOC_FAILED +#define CUSPARSE_STATUS_ARCH_MISMATCH HIPSPARSE_STATUS_ARCH_MISMATCH +#define CUSPARSE_STATUS_EXECUTION_FAILED HIPSPARSE_STATUS_EXECUTION_FAILED +#define CUSPARSE_STATUS_INSUFFICIENT_RESOURCES \ + HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES +#define CUSPARSE_STATUS_INTERNAL_ERROR HIPSPARSE_STATUS_INTERNAL_ERROR +#define CUSPARSE_STATUS_INVALID_VALUE HIPSPARSE_STATUS_INVALID_VALUE +#define CUSPARSE_STATUS_MAPPING_ERROR HIPSPARSE_STATUS_MAPPING_ERROR +#define CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED \ + HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED +#define CUSPARSE_STATUS_NOT_INITIALIZED HIPSPARSE_STATUS_NOT_INITIALIZED +#define CUSPARSE_STATUS_NOT_SUPPORTED HIPSPARSE_STATUS_NOT_SUPPORTED +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define CUSPARSE_STATUS_ZERO_PIVOT HIPSPARSE_STATUS_ZERO_PIVOT +#define cuDeviceGetName hipDeviceGetName +#define cuMemGetInfo_v2 hipMemGetInfo +#define cublasComputeType_t hipblasDatatype_t +#define cublasCreate 
hipblasCreate +#define cublasDasum_v2 hipblasDasum +#define cublasDaxpy_v2 hipblasDaxpy +#define cublasDcopy_v2 hipblasDcopy +#define cublasDdot_v2 hipblasDdot +#define cublasDestroy hipblasDestroy +#define cublasDgemmBatched hipblasDgemmBatched +#define cublasDgemm_v2 hipblasDgemm +#define cublasDgemv_v2 hipblasDgemv +#define cublasDger_v2 hipblasDger +#define cublasDnrm2_v2 hipblasDnrm2 +#define cublasDscal_v2 hipblasDscal +#define cublasDspmv_v2 hipblasDspmv +#define cublasDspr_v2 hipblasDspr +#define cublasDsyrk_v2 hipblasDsyrk +#define cublasDtpmv_v2 hipblasDtpmv +#define cublasDtrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasDtrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cublasFillMode_t hipblasFillMode_t +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmEx hipblasGemmEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasOperation_t hipblasOperation_t +#define cublasSasum_v2 hipblasSasum +#define cublasSaxpy_v2 hipblasSaxpy +#define cublasScopy_v2 hipblasScopy +#define cublasSdot_v2 hipblasSdot +#define cublasSetStream hipblasSetStream +#define cublasSgemv_v2 hipblasSgemv +#define cublasSger_v2 hipblasSger +#define cublasSnrm2_v2 hipblasSnrm2 +#define cublasSscal_v2 hipblasSscal +#define cublasSspmv_v2 hipblasSspmv +#define cublasSspr_v2 hipblasSspr +#define cublasSsyrk_v2 hipblasSsyrk +#define cublasStatus_t hipblasStatus_t +#define cublasStatus_t hipblasStatus_t +#define cublasStpmv_v2 hipblasStpmv +#define cublasStrsm_v2(a, b, c, d, e, f, g, h, i, j, k, l) \ + hipblasStrsm(a, b, c, d, e, f, g, h, const_cast(i), j, k, l) +#define cudaComputeModeExclusive hipComputeModeExclusive +#define cudaComputeModeExclusiveProcess hipComputeModeExclusiveProcess +#define cudaDataType hipDataType +#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceReset hipDeviceReset +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaErrorDeviceAlreadyInUse hipErrorContextAlreadyInUse +#define cudaErrorInvalidDevice hipErrorInvalidDevice +#define cudaError_t hipError_t +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDestroy hipEventDestroy +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipFreeHost +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#define cudaMalloc hipMalloc +#define cudaMallocHost hipHostMalloc +#define cudaMallocPitch hipMallocPitch +#define cudaMemcpy hipMemcpy +// hipMemcpy2DAsync has a disparity to its CUDA counterpart for zero-sized // copies, which should be canceled by ROCm 5.7.1+. 
Then the following would // be sufficient: // #define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpy2DAsync(a,b,c,d,width,height,e,f) \ - [&]() -> hipError_t { \ - if (width && height) \ - return hipMemcpy2DAsync(a,b,c,d,width,height,e,f); \ - return hipSuccess; \ - }() -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemGetInfo hipMemGetInfo -#define cudaMemset2DAsync hipMemset2DAsync -#define cudaMemsetAsync hipMemsetAsync -#define cudaProfilerStop hipProfilerStop -#define cudaSetDevice hipSetDevice -#define cudaStreamCreate hipStreamCreate -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread ((hipStream_t)2) -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent hipStreamWaitEvent -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define cufftComplex hipfftComplex -#define cufftDestroy hipfftDestroy -#define cufftExecR2C hipfftExecR2C -#define cufftHandle hipfftHandle -#define cufftPlanMany hipfftPlanMany -#define cufftSetStream hipfftSetStream -#define curandCreateGenerator hiprandCreateGenerator -#define curandDestroyGenerator hiprandDestroyGenerator -#define curandGenerateNormal hiprandGenerateNormal -#define curandGenerateNormalDouble hiprandGenerateNormalDouble -#define curandGenerateUniform hiprandGenerateUniform -#define curandGenerateUniformDouble hiprandGenerateUniformDouble -#define curandGenerator_t hiprandGenerator_t -#define curandSetGeneratorOffset hiprandSetGeneratorOffset -#define curandSetGeneratorOrdering(x,y) 0 // HIP does not support generator ordeing. 
-#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed -#define curandSetStream hiprandSetStream -#define curandStatus_t hiprandStatus_t -#define cusolverDnCreate hipsolverDnCreate -#define cusolverDnDestroy hipsolverDnDestroy -#define cusolverDnHandle_t hipsolverDnHandle_t -#define cusolverDnSetStream hipsolverDnSetStream -#define cusolverDnSpotrf hipsolverDnSpotrf -#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched -#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize -#define cusolverDnSpotrs hipsolverDnSpotrs -#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched -#define cusparseAction_t hipsparseAction_t -#define cusparseCreate hipsparseCreate -#define cusparseCreateCsr hipsparseCreateCsr -#define cusparseCreateDnMat hipsparseCreateDnMat -#define cusparseCreateMatDescr hipsparseCreateMatDescr -#define cusparseDcsr2csc hipsparseDcsr2csc -#define cusparseDestroy hipsparseDestroy -#define cusparseDestroy hipsparseDestroy -#define cusparseDestroyDnMat hipsparseDestroyDnMat -#define cusparseDestroyMatDescr hipsparseDestroyMatDescr -#define cusparseDestroySpMat hipsparseDestroySpMat -#define cusparseDnMatDescr_t hipsparseDnMatDescr_t -#define cusparseGetMatIndexBase hipsparseGetMatIndexBase -#define cusparseHandle_t hipsparseHandle_t -#define cusparseIndexBase_t hipsparseIndexBase_t -#define cusparseMatDescr_t hipsparseMatDescr_t -#define cusparseOperation_t hipsparseOperation_t -#define cusparseScsr2csc hipsparseScsr2csc -#define cusparseSetStream hipsparseSetStream -#define cusparseSpMM hipsparseSpMM -#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize -#define cusparseSpMatDescr_t hipsparseSpMatDescr_t -#define cusparseStatus_t hipsparseStatus_t -#define nvtxRangePop roctxRangePop -#define nvtxRangePush roctxRangePush -#define nvtxRangePushA roctxRangePushA +#define cudaMemcpy2DAsync(a, b, c, d, width, height, e, f) \ + [&]() -> hipError_t { \ + if (width && height) \ + return hipMemcpy2DAsync(a, b, c, d, width, height, e, f); \ + return hipSuccess; \ + }() +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaProfilerStop hipProfilerStop +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread ((hipStream_t)2) +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define cufftComplex hipfftComplex +#define cufftDestroy hipfftDestroy +#define cufftExecR2C hipfftExecR2C +#define cufftHandle hipfftHandle +#define cufftPlanMany hipfftPlanMany +#define cufftSetStream hipfftSetStream +#define curandCreateGenerator hiprandCreateGenerator +#define curandDestroyGenerator hiprandDestroyGenerator +#define curandGenerateNormal hiprandGenerateNormal +#define curandGenerateNormalDouble hiprandGenerateNormalDouble +#define curandGenerateUniform hiprandGenerateUniform +#define curandGenerateUniformDouble hiprandGenerateUniformDouble +#define curandGenerator_t hiprandGenerator_t +#define curandSetGeneratorOffset hiprandSetGeneratorOffset +#define 
curandSetGeneratorOrdering(x, y) \
+  0  // HIP does not support generator ordering.
+#define curandSetPseudoRandomGeneratorSeed hiprandSetPseudoRandomGeneratorSeed
+#define curandSetStream hiprandSetStream
+#define curandStatus_t hiprandStatus_t
+#define cusolverDnCreate hipsolverDnCreate
+#define cusolverDnDestroy hipsolverDnDestroy
+#define cusolverDnHandle_t hipsolverDnHandle_t
+#define cusolverDnSetStream hipsolverDnSetStream
+#define cusolverDnSpotrf hipsolverDnSpotrf
+#define cusolverDnSpotrfBatched hipsolverDnSpotrfBatched
+#define cusolverDnSpotrf_bufferSize hipsolverDnSpotrf_bufferSize
+#define cusolverDnSpotrs hipsolverDnSpotrs
+#define cusolverDnSpotrsBatched hipsolverDnSpotrsBatched
+#define cusparseAction_t hipsparseAction_t
+#define cusparseCreate hipsparseCreate
+#define cusparseCreateCsr hipsparseCreateCsr
+#define cusparseCreateDnMat hipsparseCreateDnMat
+#define cusparseCreateMatDescr hipsparseCreateMatDescr
+#define cusparseDcsr2csc hipsparseDcsr2csc
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroy hipsparseDestroy
+#define cusparseDestroyDnMat hipsparseDestroyDnMat
+#define cusparseDestroyMatDescr hipsparseDestroyMatDescr
+#define cusparseDestroySpMat hipsparseDestroySpMat
+#define cusparseDnMatDescr_t hipsparseDnMatDescr_t
+#define cusparseGetMatIndexBase hipsparseGetMatIndexBase
+#define cusparseHandle_t hipsparseHandle_t
+#define cusparseIndexBase_t hipsparseIndexBase_t
+#define cusparseMatDescr_t hipsparseMatDescr_t
+#define cusparseOperation_t hipsparseOperation_t
+#define cusparseScsr2csc hipsparseScsr2csc
+#define cusparseSetStream hipsparseSetStream
+#define cusparseSpMM hipsparseSpMM
+#define cusparseSpMM_bufferSize hipsparseSpMM_bufferSize
+#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
+#define cusparseStatus_t hipsparseStatus_t
+#define nvtxRangePop roctxRangePop
+#define nvtxRangePush roctxRangePush
+#define nvtxRangePushA roctxRangePushA
 //
 // HIPCUB namespace.
 //
@@ -256,8 +268,16 @@ inline __device__ void __syncwarp(unsigned mask=0xffffffff) {
 //
 #define CUDART_CB
+//
+// Math constants
+//
+#define CUDART_INF HIP_INF
+#define CUDART_INF_F HIP_INF_F
+
+//
+// GPU static hardware characteristics.
+//
 #define GPU_WARP_SIZE 64
 #define GPU_MAX_THREADS_PER_BLOCK 1024
-#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK/GPU_WARP_SIZE)
-#endif //__HIPIFY_H__
-
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
+#endif //__HIPIFY_H__

From 3aaa32637850c919af905b1c799b3f4919d804cd Mon Sep 17 00:00:00 2001
From: Samuel Antao
Date: Tue, 7 Nov 2023 00:00:01 +0000
Subject: [PATCH 45/76] Fix more formatting to Google style.
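Besides re-wrapping long comment lines to the 80-column limit, this moves
the "hipify.h" include in cu-kernels.cu below "cudamatrix/cu-kernels-ansi.h"
so that the project-header group stays alphabetically sorted, as the Google
C++ style guide asks. Roughly (a sketch of just these two headers; the
neighbouring system includes are elided in the diff below):

    // Project-local headers form the last include group and are sorted
    // alphabetically within it:
    #include "cudamatrix/cu-kernels-ansi.h"
    #include "hipify.h"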
--- src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu | 3 ++- src/cudamatrix/cu-kernels.cu | 2 +- src/hip/hipify.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index da2ba24bd90..5b94c34e829 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -222,7 +222,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size if (threads > GPU_MAX_THREADS_PER_BLOCK) - threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is GPU_MAX_THREADS_PER_BLOCK threads + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8d5784acb52..9127819eca5 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -32,11 +32,11 @@ #include #include -#include "hipify.h" #include #include #include "cudamatrix/cu-kernels-ansi.h" +#include "hipify.h" #else #include #include "cudamatrix/cu-kernels-ansi.h" diff --git a/src/hip/hipify.h b/src/hip/hipify.h index e9ca483d022..459372e68b8 100644 --- a/src/hip/hipify.h +++ b/src/hip/hipify.h @@ -275,7 +275,7 @@ inline __device__ void __syncwarp(unsigned mask = 0xffffffff) { #define CUDART_INF_F HIP_INF_F // -// GPU static hardware characteristics. +// GPU static hardware characteristics. // #define GPU_WARP_SIZE 64 #define GPU_MAX_THREADS_PER_BLOCK 1024 From 6ebab7023b01a4270cbd07b5c3bfce7f1ca2c461 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 7 Nov 2023 00:25:49 +0000 Subject: [PATCH 46/76] Fix header ordering. --- src/cudamatrix/cu-kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9127819eca5..9df6cea6e9d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -32,8 +32,8 @@ #include #include -#include #include +#include #include "cudamatrix/cu-kernels-ansi.h" #include "hipify.h" From 7efdeaeb10ed0ae2593ee69faa04b5172a39aba9 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Tue, 7 Nov 2023 05:16:09 -0600 Subject: [PATCH 47/76] Add GPU characteristics for CUDA. 
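These mirror the values that hipify.h hard-codes for the ROCm build (where
the warp size is 64), so kernel code shared by both builds can derive its
launch configuration from one set of macros instead of vendor-specific
literals. A minimal sketch of the usage pattern (the helper name is
hypothetical, not code from this patch):

    // Round a per-row work size up to a whole number of warps, capped at
    // the per-block thread limit; GPU_WARP_SIZE is 32 on CUDA, 64 on ROCm.
    static inline int RoundUpToWarps(int dim) {
      int threads = (dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
      if (threads > GPU_MAX_THREADS_PER_BLOCK)
        threads = GPU_MAX_THREADS_PER_BLOCK;
      return threads;
    }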
---
 src/cudamatrix/cu-common.h   | 4 ++++
 src/cudamatrix/cu-kernels.cu | 1 +
 2 files changed, 5 insertions(+)

diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h
index 934668da6f2..3206fe7e7f4 100644
--- a/src/cudamatrix/cu-common.h
+++ b/src/cudamatrix/cu-common.h
@@ -45,6 +45,10 @@
 #include
 #include
 #include
+
+#define GPU_WARP_SIZE 32
+#define GPU_MAX_THREADS_PER_BLOCK 1024
+#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE)
 #endif

 #define CU_SAFE_CALL(fun) \
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 9df6cea6e9d..b3c3165bd96 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -39,6 +39,7 @@
 #include "hipify.h"
 #else
 #include
+#include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-kernels-ansi.h"
 #include
 #include // for CUDA_VERSION

From 700bf93631b8c999f0421fffd74b4a29eb2685a3 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 9 Nov 2023 11:49:12 +0100
Subject: [PATCH 48/76] [tools] Replace uses of Python distutils

The `distutils` package has been removed in Python 3.12:
https://docs.python.org/3.11/distutils/index.html

The `sysconfig` package is available since Python 3.2 and provides the
necessary replacement functionality:
https://docs.python.org/3/library/sysconfig.html
---
 tools/extras/install_cffi.sh          | 2 +-
 tools/extras/install_mmseg.sh         | 8 ++++----
 tools/extras/install_phonetisaurus.sh | 8 ++++----
 tools/extras/install_sequitur.sh      | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tools/extras/install_cffi.sh b/tools/extras/install_cffi.sh
index dc7f91724a7..5ac9904173e 100755
--- a/tools/extras/install_cffi.sh
+++ b/tools/extras/install_cffi.sh
@@ -35,7 +35,7 @@ echo "**** Installing Cffi and dependencies"

 echo "Checking for Python-Dev"
 # copied from https://stackoverflow.com/questions/4848566/check-for-existence-of-python-dev-files-from-bash-script
-if [ ! -e $(python -c 'from distutils.sysconfig import get_makefile_filename as m; print m()') ]; then
+if [ ! -e $(python -c 'from sysconfig import get_makefile_filename as m; print(m())') ]; then
   echo "On Debian/Ubuntu like system install by 'sudo apt-get python-dev' package."
   echo "On Fedora by 'yum install python-devel'"
   echo "On Mac OS X by 'brew install python'"
diff --git a/tools/extras/install_mmseg.sh b/tools/extras/install_mmseg.sh
index a76b98e2061..e6e17716718 100755
--- a/tools/extras/install_mmseg.sh
+++ b/tools/extras/install_mmseg.sh
@@ -16,13 +16,13 @@ fi

 # Install python-devel package if not already available
-# first, makes sure distutils.sysconfig usable
-if ! $(python -c "import distutils.sysconfig" &> /dev/null); then
-  echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2
+# first, makes sure sysconfig is usable
+if ! $(python -c "import sysconfig" &> /dev/null); then
+  echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2
   echo "Proceeding with installation." >&2
 else
   # get include path for this python version
-  INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())")
+  INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))")
   if [ !
-f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then diff --git a/tools/extras/install_phonetisaurus.sh b/tools/extras/install_phonetisaurus.sh index 8a07c5f5ca5..e407978972f 100755 --- a/tools/extras/install_phonetisaurus.sh +++ b/tools/extras/install_phonetisaurus.sh @@ -15,16 +15,16 @@ fi echo "You must call this script from the tools/ directory" && exit 1; # Install python-devel package if not already available -# first, makes sure distutils.sysconfig usable +# first, makes sure sysconfig is usable # We are not currently compiling the bindings by default, but it seems # worth it to keep this section as we do have them and they will # probably be used. -if ! $(python -c "import distutils.sysconfig" &> /dev/null); then - echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 +if ! $(python -c "import sysconfig" &> /dev/null); then + echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 echo "Proceeding with installation." >&2 else # get include path for this python version - INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())") + INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))") if [ ! -f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index b70e6cbb447..62b27e451ac 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -15,13 +15,13 @@ fi echo "You must call this script from the tools/ directory" && exit 1; # Install python-devel package if not already available -# first, makes sure distutils.sysconfig usable -if ! $(python -c "import distutils.sysconfig" &> /dev/null); then - echo "$0: WARNING: python library distutils.sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 +# first, makes sure sysconfig is usable +if ! $(python -c "import sysconfig" &> /dev/null); then + echo "$0: WARNING: python library sysconfig not usable, this is necessary to figure out the path of Python.h." >&2 echo "Proceeding with installation." >&2 else # get include path for this python version - INCLUDE_PY=$(python -c "from distutils import sysconfig as s; print(s.get_python_inc())") + INCLUDE_PY=$(python -c "import sysconfig as s; print(s.get_path('include'))") if [ ! -f "${INCLUDE_PY}/Python.h" ]; then echo "$0 : ERROR: python-devel/python-dev not installed" >&2 if which yum >&/dev/null; then From cd2b8354b7d2e3a734b8d87d44c566cb4d8f2d0e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 10 Nov 2023 16:39:48 +0800 Subject: [PATCH 49/76] Fix #4870, spurious error in ProcessNonemitting; queue can validly be empty. 
# Conflicts: # src/cudamatrix/cu-kernels.cu --- src/decoder/lattice-simple-decoder.cc | 70 ++++++++++++--------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc index cc8712e854d..87378f93bbd 100644 --- a/src/decoder/lattice-simple-decoder.cc +++ b/src/decoder/lattice-simple-decoder.cc @@ -45,8 +45,8 @@ void LatticeSimpleDecoder::InitDecoding() { bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { InitDecoding(); - - while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { + + while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); ProcessEmitting(decodable); @@ -57,7 +57,7 @@ bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { ProcessNonemitting(); } FinalizeDecoding(); - + // Returns true if we have any kind of traceback available (not necessarily // to the end state; query ReachedFinal() for that). return !final_costs_.empty(); @@ -88,9 +88,9 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, if (decoding_finalized_ && !use_final_probs) KALDI_ERR << "You cannot call FinalizeDecoding() and then call " << "GetRawLattice() with use_final_probs == false"; - + unordered_map final_costs_local; - + const unordered_map &final_costs = (decoding_finalized_ ? final_costs_ : final_costs_local); @@ -100,7 +100,7 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, ofst->DeleteStates(); int32 num_frames = NumFramesDecoded(); KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_/2 + 3; + const int32 bucket_count = num_toks_/2 + 3; unordered_map tok_map(bucket_count); // First create all states. for (int32 f = 0; f <= num_frames; f++) { @@ -169,10 +169,10 @@ bool LatticeSimpleDecoder::GetLattice( fst::ILabelCompare ilabel_comp; ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes // lattice-determinization more efficient. - + fst::DeterminizeLatticePrunedOptions lat_opts; lat_opts.max_mem = config_.det_opts.max_mem; - + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. Connect(ofst); // Remove unreachable states... there might be @@ -196,7 +196,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( bool emitting, bool *changed) { KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - + unordered_map::iterator find_iter = cur_toks_.find(state); if (find_iter == cur_toks_.end()) { // no such token presently. // Create one. @@ -221,7 +221,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( return tok; } } - + // delta is the amount by which the extra_costs must // change before it sets "extra_costs_changed" to true. If delta is larger, // we'll tend to go back less far toward the beginning of the file. @@ -242,7 +242,7 @@ void LatticeSimpleDecoder::PruneForwardLinks( warned_ = true; } } - + bool changed = true; while (changed) { changed = false; @@ -300,7 +300,7 @@ void LatticeSimpleDecoder::ComputeFinalCosts( BaseFloat infinity = std::numeric_limits::infinity(); BaseFloat best_cost = infinity, best_cost_with_final = infinity; - + for (unordered_map::const_iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { StateId state = iter->first; @@ -336,19 +336,19 @@ void LatticeSimpleDecoder::ComputeFinalCosts( // on the final frame. 
If there are final tokens active, it uses the final-probs // for pruning, otherwise it treats all tokens as final. void LatticeSimpleDecoder::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); + KALDI_ASSERT(!active_toks_.empty()); int32 frame_plus_one = active_toks_.size() - 1; if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. KALDI_WARN << "No tokens alive at end of file\n"; - typedef unordered_map::const_iterator IterType; + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; // We're about to delete some of the tokens active on the final frame, so we // clear cur_toks_ because otherwise it would then contain dangling pointers. cur_toks_.clear(); - + // Now go through tokens on this frame, pruning forward links... may have to // iterate a few times until there is no more change, because the list is not // in topological order. This is a modified version of the code in @@ -429,7 +429,7 @@ BaseFloat LatticeSimpleDecoder::FinalRelativeCost() const { return final_relative_cost_; } } - + // Prune away any tokens on this frame that have no forward links. [we don't do // this in PruneForwardLinks because it would give us a problem with dangling // pointers]. @@ -453,14 +453,14 @@ void LatticeSimpleDecoder::PruneTokensForFrame(int32 frame) { } } } - + // Go backwards through still-alive tokens, pruning them, starting not from // the current frame (where we want to keep all tokens) but from the frame before // that. We go backwards through the frames and stop when we reach a point // where the delta-costs are not changing (and the delta controls when we consider // a cost to have "not changed"). void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); + int32 cur_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; // The index "f" below represents a "frame plus one", i.e. you'd have to subtract // one to get the corresponding index for the decodable object. @@ -468,7 +468,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // Reason why we need to prune forward links in this situation: // (1) we have never pruned them // (2) we never pruned the forward links on the next frame, which - // + // if (active_toks_[f].must_prune_forward_links) { bool extra_costs_changed = false, links_pruned = false; PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); @@ -478,7 +478,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { active_toks_[f].must_prune_tokens = true; active_toks_[f].must_prune_forward_links = false; } - if (f+1 < cur_frame_plus_one && + if (f+1 < cur_frame_plus_one && active_toks_[f+1].must_prune_tokens) { PruneTokensForFrame(f+1); active_toks_[f+1].must_prune_tokens = false; @@ -493,20 +493,20 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // (optionally) on the final frame. Takes into account the final-prob of // tokens. This function used to be called PruneActiveTokensFinal(). void LatticeSimpleDecoder::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); + int32 final_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { + for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { bool b1, b2; // values not used. 
BaseFloat dontcare = 0.0; PruneForwardLinks(f, &b1, &b2, dontcare); PruneTokensForFrame(f + 1); } - PruneTokensForFrame(0); + PruneTokensForFrame(0); KALDI_VLOG(3) << "pruned tokens from " << num_toks_begin << " to " << num_toks_; } - + void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { int32 frame = active_toks_.size() - 1; // frame is the frame-index // (zero-based) used to get likelihoods @@ -538,9 +538,9 @@ void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { // AddToken adds the next_tok to cur_toks_ (if not already present). Token *next_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, true, NULL); - + // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } @@ -553,7 +553,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() { // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). - + // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. Note-- this queue structure is is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), @@ -569,15 +569,9 @@ void LatticeSimpleDecoder::ProcessNonemitting() { queue.push_back(state); best_cost = std::min(best_cost, iter->second->tot_cost); } - if (queue.empty()) { - if (!warned_) { - KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is " - << frame; - warned_ = true; - } - } + BaseFloat cutoff = best_cost + config_.beam; - + while (!queue.empty()) { StateId state = queue.back(); queue.pop_back(); @@ -600,10 +594,10 @@ void LatticeSimpleDecoder::ProcessNonemitting() { bool changed; Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, false, &changed); - + tok->links = new ForwardLink(new_tok, 0, arc.olabel, graph_cost, 0, tok->links); - + // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. 
if (changed && fst_.NumInputEpsilons(arc.nextstate) != 0) @@ -662,5 +656,3 @@ void LatticeSimpleDecoder::PruneCurrentTokens(BaseFloat beam, unordered_map Date: Mon, 13 Nov 2023 11:54:21 +0800 Subject: [PATCH 50/76] Update fix_data_dir.sh --- egs/wsj/s5/utils/fix_data_dir.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index ed4710d0b1f..051715f2b1e 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -54,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x From b7e886c13a075bd73a09d515f54fb6a159eb1472 Mon Sep 17 00:00:00 2001 From: Omer Danziger <57575138+Omerdan03@users.noreply.github.com> Date: Thu, 14 Dec 2023 10:18:15 +0200 Subject: [PATCH 51/76] Update COPYING The list of Individual Contributors wasn't really in alphabetical order --- COPYING | 106 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/COPYING b/COPYING index 5a5cab00a29..2b0dbd4243a 100644 --- a/COPYING +++ b/COPYING @@ -57,72 +57,72 @@ License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - Mohit Agarwal - Tanel Alumae - Gilles Boulianne - Lukas Burget - Dogan Can - Guoguo Chen - Gaofeng Cheng + Albert Vernon + Alexander Solovets + Allen Guo + Ariya Rastrow + Arnab Ghoshal Cisco Corporation - Pavel Denisov - Ilya Edrenkin - Ewald Enzinger - Joachim Fainberg Daniel Galvez - Pegah Ghahremani - Arnab Ghoshal - Ondrej Glembek + Daniel Povey + Danijel Korzinek + David Snyder + Dogan Can + Eduardo Silva + Ewald Enzinger + Gaofeng Cheng + Gaurav Kumar + Georg Stemmer + Gilles Boulianne Go Vivace Inc. - Allen Guo - Hossein Hadian - Lv Hang - Mirko Hannemann + Guoguo Chen + Haihua Xu + Hainan Xu Hendy Irawan - Navdeep Jaitly + Hossein Hadian + Ilya Edrenkin + Jan "Yenda" Trmal + Jan Silovsky + Joachim Fainberg Johns Hopkins University - Shiyin Kang - Kirill Katsnelson - Tom Ko - Danijel Korzinek - Gaurav Kumar + Karel Vesely Ke Li + Kirill Katsnelson + Lucas Ondel + Lukas Burget + Lv Hang Matthew Maciejewski - Vimal Manohar - Yajie Miao Microsoft Corporation + Minhua Wu + Mirko Hannemann + Mohit Agarwal + Navdeep Jaitly + Nickolay V. Shmyrev + Omid Sadjadi + Ondrej Glembek + Ondrej Platek + Pavel Denisov + Pawel Swietojanski + Pegah Ghahremani + Peter Smit Petr Motlicek - Xingyu Na - Vincent Nguyen - Lucas Ondel - Vassil Panayotov - Vijayaditya Peddinti + Petr Schwarz Phonexia s.r.o. - Ondrej Platek - Daniel Povey - Yanmin Qian - Ariya Rastrow Saarland University - Omid Sadjadi - Petr Schwarz - Yiwen Shao - Nickolay V. 
Shmyrev - Jan Silovsky - Eduardo Silva - Peter Smit - David Snyder - Alexander Solovets - Georg Stemmer - Pawel Swietojanski - Jan "Yenda" Trmal - Albert Vernon - Karel Vesely - Yiming Wang Shinji Watanabe - Minhua Wu - Haihua Xu - Hainan Xu + Shiyin Kang + Tanel Alumae + Tom Ko + Vassil Panayotov + Vijayaditya Peddinti + Vimal Manohar + Vincent Nguyen Xiaohui Zhang + Xingyu Na + Yajie Miao + Yanmin Qian + Yiming Wang + Yiwen Shao Other Source Material From 77ffb5556c825adcf22138d387967c627d54c415 Mon Sep 17 00:00:00 2001 From: Ilia Dzenzeliuk <43926347+dzen03@users.noreply.github.com> Date: Sun, 21 Jan 2024 18:08:34 +0300 Subject: [PATCH 52/76] Fix unused but set variable --- src/base/kaldi-error.h | 8 ++++---- src/bin/matrix-sum.cc | 6 ++---- src/bin/vector-sum.cc | 3 +-- src/chainbin/nnet3-chain-copy-egs.cc | 4 +--- src/fstext/pre-determinize-inl.h | 3 --- src/gmm/mle-diag-gmm-test.cc | 7 ++----- src/gmm/mle-full-gmm-test.cc | 7 ++----- src/gmmbin/gmm-acc-mllt-global.cc | 4 +--- src/ivector/ivector-extractor-test.cc | 3 +-- src/kwsbin/kws-search.cc | 2 -- src/latbin/lattice-oracle.cc | 3 +-- src/latbin/lattice-prune.cc | 3 +-- src/latbin/lattice-to-mpe-post.cc | 3 +-- src/latbin/lattice-to-smbr-post.cc | 3 +-- src/matrix/matrix-functions.cc | 2 -- src/nnet2/nnet-compute-discriminative.cc | 3 +-- src/nnet3/nnet-example-utils.cc | 2 -- src/online2bin/apply-cmvn-online.cc | 3 +-- src/online2bin/ivector-extract-online2.cc | 3 +-- src/tree/build-tree-utils.cc | 2 -- 20 files changed, 21 insertions(+), 53 deletions(-) diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index a9904a752cd..572cbb4effd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -185,12 +185,12 @@ class MessageLogger { #define KALDI_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_ASSERT(cond) (void)0 +#define KALDI_ASSERT(cond) (void)(cond) #endif // Some more expensive asserts only checked if this defined. 
@@ -198,12 +198,12 @@ class MessageLogger { #define KALDI_PARANOID_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_PARANOID_ASSERT(cond) (void)0 +#define KALDI_PARANOID_ASSERT(cond) (void)(cond) #endif /***** THIRD-PARTY LOG-HANDLER *****/ diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 3c93dfd0d39..6aee0c5ce78 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -49,7 +49,7 @@ int32 TypeOneUsage(const ParseOptions &po, } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -78,7 +78,6 @@ int32 TypeOneUsage(const ParseOptions &po, << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " @@ -124,7 +123,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -151,7 +150,6 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 3e622cafdc7..d03bf671245 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -52,7 +52,7 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 n_utts = 0, n_total_vectors = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !vector_reader1.Done(); vector_reader1.Next()) { std::string key = vector_reader1.Key(); @@ -75,7 +75,6 @@ int32 TypeOneUsage(const ParseOptions &po) { << "system " << (i + 2) << ", rspecifier: " << vector_in_fns[i] << " vs " << vector_out.Dim() << " primary vector, rspecifier:" << vector_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No vector found for utterance " << key << " for " diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60a2645b31b 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -347,7 +347,7 @@ int main(int argc, char *argv[]) { // not configurable for now. 
exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_read = 0, num_written = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -361,7 +361,6 @@ int main(int argc, char *argv[]) { BaseFloat weight = 1.0; if (!egs_weight_reader.HasKey(key)) { KALDI_WARN << "No weight for example key " << key; - num_err++; continue; } weight = egs_weight_reader.Value(key); @@ -371,7 +370,6 @@ int main(int argc, char *argv[]) { if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; - num_err++; continue; } std::string new_output_name = output_name_reader.Value(key); diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index b67b0ba6fa6..ea6608ce38a 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -411,8 +411,6 @@ void PreDeterminize(MutableFst *fst, std::vector d_vec(max_state+1, false); // "done vector". Purely for debugging. - size_t num_extra_det_states = 0; - // (D)(v) while (Q.size() != 0) { @@ -491,7 +489,6 @@ void PreDeterminize(MutableFst *fst, assert(m_map.count(this_pr.first) == 0); m_map[this_pr.first] = k; k++; - num_extra_det_states++; } } else { // Create the set V_t. V_t.insert(this_pr.second); diff --git a/src/gmm/mle-diag-gmm-test.cc b/src/gmm/mle-diag-gmm-test.cc index d1af7725d20..a91832cd254 100644 --- a/src/gmm/mle-diag-gmm-test.cc +++ b/src/gmm/mle-diag-gmm-test.cc @@ -139,12 +139,10 @@ void test_flags_driven_update(const DiagGmm &gmm, // now both models gmm_all_update, gmm_all_update have the same params updated // compute loglike for models for check - double loglike0 = 0.0; double loglike1 = 0.0; double loglike2 = 0.0; for (int32 i = 0; i < feats.NumRows(); i++) { - loglike0 += static_cast( - gmm.LogLikelihood(feats.Row(i))); + gmm.LogLikelihood(feats.Row(i)); loglike1 += static_cast( gmm_all_update.LogLikelihood(feats.Row(i))); loglike2 += static_cast( @@ -366,9 +364,8 @@ UnitTestEstimateDiagGmm() { est_gmm.Resize(gmm->NumGauss(), gmm->Dim(), flags_all); est_gmm.SetZero(flags_all); - float loglike = 0.0; for (size_t i = 0; i < counter; i++) { - loglike += est_gmm.AccumulateFromDiag(*gmm, feats.Row(i), 1.0F); + est_gmm.AccumulateFromDiag(*gmm, feats.Row(i), 1.0F); } test_io(*gmm, est_gmm, false, feats); // ASCII mode test_io(*gmm, est_gmm, true, feats); // Binary mode diff --git a/src/gmm/mle-full-gmm-test.cc b/src/gmm/mle-full-gmm-test.cc index 472db88d501..26c5460f024 100644 --- a/src/gmm/mle-full-gmm-test.cc +++ b/src/gmm/mle-full-gmm-test.cc @@ -200,12 +200,10 @@ void test_flags_driven_update(const FullGmm &gmm, // now both models gmm_all_update, gmm_all_update have the same params updated // compute loglike for models for check - double loglike0 = 0.0; double loglike1 = 0.0; double loglike2 = 0.0; for (int32 i = 0; i < feats.NumRows(); i++) { - loglike0 += static_cast( - gmm.LogLikelihood(feats.Row(i))); + gmm.LogLikelihood(feats.Row(i)); loglike1 += static_cast( gmm_all_update.LogLikelihood(feats.Row(i))); loglike2 += static_cast( @@ -462,9 +460,8 @@ UnitTestEstimateFullGmm() { est_gmm.Resize(gmm->NumGauss(), gmm->Dim(), flags_all); est_gmm.SetZero(flags_all); - float loglike = 0.0; for (int32 i = 0; i < counter; i++) { - loglike += est_gmm.AccumulateFromFull(*gmm, feats.Row(i), 1.0F); + est_gmm.AccumulateFromFull(*gmm, feats.Row(i), 1.0F); } 
test_io(*gmm, est_gmm, false, feats); test_io(*gmm, est_gmm, true, feats); diff --git a/src/gmmbin/gmm-acc-mllt-global.cc b/src/gmmbin/gmm-acc-mllt-global.cc index bed91c053d3..b6b7a2b5635 100644 --- a/src/gmmbin/gmm-acc-mllt-global.cc +++ b/src/gmmbin/gmm-acc-mllt-global.cc @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) { SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - int32 num_done = 0, num_err = 0; + int32 num_done = 0; for (; !feature_reader.Done(); feature_reader.Next()) { std::string utt = feature_reader.Key(); const Matrix &mat = feature_reader.Value(); @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) { } else { if (!gselect_reader.HasKey(utt)) { KALDI_WARN << "No gselect information for utterance " << utt; - num_err++; continue; } const std::vector > &gselect= gselect_reader.Value(utt); @@ -96,7 +95,6 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Gselect information has wrong size for utterance " << utt << ", " << gselect.size() << " vs. " << mat.NumRows(); - num_err++; continue; } diff --git a/src/ivector/ivector-extractor-test.cc b/src/ivector/ivector-extractor-test.cc index cb08464fbe8..ffd5a2561cc 100644 --- a/src/ivector/ivector-extractor-test.cc +++ b/src/ivector/ivector-extractor-test.cc @@ -94,11 +94,10 @@ void TestIvectorExtraction(const IvectorExtractor &extractor, ivector_dim = extractor.IvectorDim(); Posterior post(num_frames); - double tot_log_like = 0.0; for (int32 t = 0; t < num_frames; t++) { SubVector frame(feats, t); Vector posterior(fgmm.NumGauss(), kUndefined); - tot_log_like += fgmm.ComponentPosteriors(frame, &posterior); + fgmm.ComponentPosteriors(frame, &posterior); for (int32 i = 0; i < posterior.Dim(); i++) post[t].push_back(std::make_pair(i, posterior(i))); } diff --git a/src/kwsbin/kws-search.cc b/src/kwsbin/kws-search.cc index 8e2b2a84def..c76a5d46eb9 100644 --- a/src/kwsbin/kws-search.cc +++ b/src/kwsbin/kws-search.cc @@ -287,7 +287,6 @@ int main(int argc, char *argv[]) { ArcSort(&index, fst::ILabelCompare()); int32 n_done = 0; - int32 n_fail = 0; for (; !keyword_reader.Done(); keyword_reader.Next()) { std::string key = keyword_reader.Key(); VectorFst keyword = keyword_reader.Value(); @@ -336,7 +335,6 @@ int main(int argc, char *argv[]) { if (result_fst.Final(arc.nextstate) != Weight::One()) { KALDI_WARN << "The resulting FST does not have " << "the expected structure for key " << key; - n_fail++; continue; } diff --git a/src/latbin/lattice-oracle.cc b/src/latbin/lattice-oracle.cc index 5f2513131d7..054a0676e37 100644 --- a/src/latbin/lattice-oracle.cc +++ b/src/latbin/lattice-oracle.cc @@ -257,7 +257,7 @@ int main(int argc, char *argv[]) { } int32 n_done = 0, n_fail = 0; - int32 tot_correct = 0, tot_substitutions = 0, + int32 tot_substitutions = 0, tot_insertions = 0, tot_deletions = 0, tot_words = 0; for (; !lattice_reader.Done(); lattice_reader.Next()) { @@ -320,7 +320,6 @@ int main(int argc, char *argv[]) { KALDI_LOG << "%WER " << (100.*tot_errs) / num_words << " [ " << tot_errs << " / " << num_words << ", " << insertions << " insertions, " << deletions << " deletions, " << substitutions << " sub ]"; - tot_correct += correct; tot_substitutions += substitutions; tot_insertions += insertions; tot_deletions += deletions; diff --git a/src/latbin/lattice-prune.cc b/src/latbin/lattice-prune.cc index 49399f748e4..d87f5ded28f 100644 --- a/src/latbin/lattice-prune.cc +++ b/src/latbin/lattice-prune.cc @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { 
SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - int32 n_done = 0, n_err = 0; + int32 n_done = 0; int64 n_arcs_in = 0, n_arcs_out = 0, n_states_in = 0, n_states_out = 0; @@ -86,7 +86,6 @@ int main(int argc, char *argv[]) { CompactLattice pruned_clat(clat); if (!PruneLattice(beam, &pruned_clat)) { KALDI_WARN << "Error pruning lattice for utterance " << key; - n_err++; } int64 pruned_narcs = NumArcs(pruned_clat), pruned_nstates = pruned_clat.NumStates(); diff --git a/src/latbin/lattice-to-mpe-post.cc b/src/latbin/lattice-to-mpe-post.cc index 7961cc5c438..771399a32a4 100644 --- a/src/latbin/lattice-to-mpe-post.cc +++ b/src/latbin/lattice-to-mpe-post.cc @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); } - int32 num_done = 0, num_err = 0; + int32 num_done = 0; double total_lat_frame_acc = 0.0, lat_frame_acc; double total_time = 0, lat_time; @@ -114,7 +114,6 @@ int main(int argc, char *argv[]) { if (!alignments_reader.HasKey(key)) { KALDI_WARN << "No alignment for utterance " << key; - num_err++; } else { const std::vector &alignment = alignments_reader.Value(key); Posterior post; diff --git a/src/latbin/lattice-to-smbr-post.cc b/src/latbin/lattice-to-smbr-post.cc index e2772316954..6b2861b395f 100644 --- a/src/latbin/lattice-to-smbr-post.cc +++ b/src/latbin/lattice-to-smbr-post.cc @@ -95,7 +95,7 @@ int main(int argc, char *argv[]) { trans_model.Read(ki.Stream(), binary); } - int32 num_done = 0, num_err = 0; + int32 num_done = 0; double total_lat_frame_acc = 0.0, lat_frame_acc; double total_time = 0, lat_time; @@ -115,7 +115,6 @@ int main(int argc, char *argv[]) { if (!alignments_reader.HasKey(key)) { KALDI_WARN << "No alignment for utterance " << key; - num_err++; } else { const std::vector &alignment = alignments_reader.Value(key); Posterior post; diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 496c09f5344..6942b220da6 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -669,12 +669,10 @@ void ComputePca(const MatrixBase &X, Nsp.TopEigs(&l, &Vtmp); } - MatrixIndexT num_zeroed = 0; for (MatrixIndexT g = 0; g < G; g++) { if (l(g) < 0.0) { KALDI_WARN << "In PCA, setting element " << l(g) << " to zero."; l(g) = 0.0; - num_zeroed++; } } SortSvd(&l, &Vtmp); // Make sure zero elements are last, this diff --git a/src/nnet2/nnet-compute-discriminative.cc b/src/nnet2/nnet-compute-discriminative.cc index 65c48097bf9..16d34160508 100644 --- a/src/nnet2/nnet-compute-discriminative.cc +++ b/src/nnet2/nnet-compute-discriminative.cc @@ -296,7 +296,7 @@ void NnetDiscriminativeUpdater::LatticeComputations() { ScalePosterior(eg_.weight, &post); - double tot_num_post = 0.0, tot_den_post = 0.0; + double tot_num_post = 0.0; std::vector > sv_labels; sv_labels.reserve(answers.size()); for (int32 t = 0; t < post.size(); t++) { @@ -304,7 +304,6 @@ void NnetDiscriminativeUpdater::LatticeComputations() { int32 pdf_id = post[t][i].first; BaseFloat weight = post[t][i].second; if (weight > 0.0) { tot_num_post += weight; } - else { tot_den_post -= weight; } MatrixElement elem = {t, pdf_id, weight}; sv_labels.push_back(elem); } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index facbbb19be0..06278610553 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -673,11 +673,9 @@ void UtteranceSplitter::InitSplits(std::vector > *splits) con 
vec.push_back(config_.num_frames[i]); if (j > 0) vec.push_back(config_.num_frames[j]); - int32 n = 0; while (DefaultDurationOfSplit(vec) <= default_duration_ceiling) { if (!vec.empty()) // Don't allow the empty vector as a split. splits_set.insert(vec); - n++; vec.push_back(primary_length); std::sort(vec.begin(), vec.end()); } diff --git a/src/online2bin/apply-cmvn-online.cc b/src/online2bin/apply-cmvn-online.cc index 06157d0fcdf..615941f760a 100644 --- a/src/online2bin/apply-cmvn-online.cc +++ b/src/online2bin/apply-cmvn-online.cc @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { BaseFloatMatrixWriter feature_writer(feature_wspecifier); - int32 num_done = 0, num_err = 0; + int32 num_done = 0; int64 tot_t = 0; if (spk2utt_rspecifier != "") { @@ -82,7 +82,6 @@ int main(int argc, char *argv[]) { std::string utt = uttlist[i]; if (!feature_reader.HasKey(utt)) { KALDI_WARN << "No features for utterance " << utt; - num_err++; continue; } const Matrix &feats = feature_reader.Value(utt); diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc index e697de6d15a..eafc0e64124 100644 --- a/src/online2bin/ivector-extract-online2.cc +++ b/src/online2bin/ivector-extract-online2.cc @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) { feature_rspecifier = po.GetArg(2), ivectors_wspecifier = po.GetArg(3); - double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0, + double tot_objf_impr = 0.0, tot_t = 0.0, tot_length = 0.0, tot_length_utt_end = 0.0; int32 num_done = 0, num_err = 0; @@ -166,7 +166,6 @@ int main(int argc, char *argv[]) { } // Update diagnostics. - tot_ubm_loglike += T * ivector_feature.UbmLogLikePerFrame(); tot_objf_impr += T * ivector_feature.ObjfImprPerFrame(); tot_length_utt_end += T * ivectors.Row(num_ivectors - 1).Norm(2.0); for (int32 i = 0; i < num_ivectors; i++) diff --git a/src/tree/build-tree-utils.cc b/src/tree/build-tree-utils.cc index 254d7ec36d8..cf88a408fcb 100644 --- a/src/tree/build-tree-utils.cc +++ b/src/tree/build-tree-utils.cc @@ -538,7 +538,6 @@ EventMap *SplitDecisionTree(const EventMap &input_map, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out) { KALDI_ASSERT(num_leaves != NULL && *num_leaves > 0); // can't be 0 or input_map would be empty. - int32 num_empty_leaves = 0; BaseFloat like_impr = 0.0; BaseFloat smallest_split_change = 1.0e+20; std::vector builders; @@ -550,7 +549,6 @@ EventMap *SplitDecisionTree(const EventMap &input_map, builders.resize(split_stats.size()); // size == #leaves. for (size_t i = 0;i < split_stats.size();i++) { EventAnswerType leaf = static_cast(i); - if (split_stats[i].size() == 0) num_empty_leaves++; builders[i] = new DecisionTreeSplitter(leaf, split_stats[i], q_opts); } } From f785fa0b623c14f28bf01dc1bd12534a24ed78bf Mon Sep 17 00:00:00 2001 From: Roland Fehrenbacher Date: Mon, 29 Jan 2024 15:01:33 +0100 Subject: [PATCH 53/76] configure: Fix gcc version check for cuda --- src/configure | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/configure b/src/configure index 37a75a5cade..82e4e3d9149 100755 --- a/src/configure +++ b/src/configure @@ -389,8 +389,8 @@ Either your CUDA is too new or too old." CUSOLVER=true ;; 12_*) - MIN_UNSUPPORTED_GCC_VER="12.2" - MIN_UNSUPPORTED_GCC_VER_NUM=122000 + MIN_UNSUPPORTED_GCC_VER="12.3" + MIN_UNSUPPORTED_GCC_VER_NUM=123000 CUSOLVER=true ;; *) @@ -399,9 +399,10 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ output of either 'nvcc -h' or 'ptxas -h'." 
;;
 esac

-  (( GCC_VER_NUM < MIN_UNSUPPORTED_GCC_VER_NUM )) ||
+  if [ $GCC_VER_NUM -ge $MIN_UNSUPPORTED_GCC_VER_NUM ]; then
   failure "CUDA $CUDA_VERSION does not support $CXX (g++-$GCC_VER).\
 Only versions strictly older than $MIN_UNSUPPORTED_GCC_VER are supported."
+  fi

 case $CUDA_VERSION in
   [1-8]_* | 9_0) CUSOLVER=false ;;

From a74256f1e1c9a9db4661b6a9ffd1900c8eed97d4 Mon Sep 17 00:00:00 2001
From: Paul Guyot
Date: Tue, 30 Apr 2024 09:14:10 +0200
Subject: [PATCH 54/76] Workaround for macOS bug with strdup (fixes #4855)

Signed-off-by: Paul Guyot
---
 tools/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/Makefile b/tools/Makefile
index 5099c60505a..951280d08f5 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -165,6 +165,11 @@ sph2pipe_v$(SPH2PIPE_VERSION)/Makefile: sph2pipe-$(SPH2PIPE_VERSION).tar.gz
 	rm -rf sph2pipe_v*
 	tar -xmzf sph2pipe-$(SPH2PIPE_VERSION).tar.gz
 	mv sph2pipe-$(SPH2PIPE_VERSION) sph2pipe_v$(SPH2PIPE_VERSION)
+	# Workaround for macOS bug
+	if [ `uname` = "Darwin" ]; then \
+	  sed -i -e "s/#define _XOPEN_SOURCE 500/#define _XOPEN_SOURCE 600/g" sph2pipe_v$(SPH2PIPE_VERSION)/sph2pipe.c ; \
+	  sed -i -e "s/#define _XOPEN_SOURCE 500/#define _XOPEN_SOURCE 600/g" sph2pipe_v$(SPH2PIPE_VERSION)/file_headers.c ; \
+	fi

 sph2pipe-$(SPH2PIPE_VERSION).tar.gz:
 	if [ -d "$(DOWNLOAD_DIR)" ]; then \

From a979f2a565b4771b5d197b19c3a1510c049b6f41 Mon Sep 17 00:00:00 2001
From: Paul Guyot
Date: Tue, 30 Apr 2024 09:45:27 +0200
Subject: [PATCH 55/76] Patch sctk for macOS picky compiler

Signed-off-by: Paul Guyot
---
 tools/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/Makefile b/tools/Makefile
index 5099c60505a..dbe932aa0e6 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -121,6 +121,8 @@ sclite sctk_made: sctk/.compiled

 sctk/.compiled: sctk
 	rm -f sctk/.compiled
+	sed -i -e '2s/^/#include \n/' sctk/src/sclite/align.c
+	sed -i -e '99s/^/int TEXT_set_lang_prof(char *lprof);\n/' sctk/src/sclite/text.h
 	$(SCTK_MKENV) $(MAKE) -C sctk config
 	$(SCTK_MKENV) $(MAKE) -C sctk all doc
 	$(MAKE) -C sctk install

From f328393374b4d9c99fafe09fc793e251096ff355 Mon Sep 17 00:00:00 2001
From: danijel3
Date: Sun, 2 Jun 2024 23:11:33 +0200
Subject: [PATCH 56/76] Fix missing FLT_MAX in some CUDA installation scenarios.
---
 src/cudadecoder/cuda-decoder-kernels.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu
index 8503182c1f8..e20a7dea15c 100644
--- a/src/cudadecoder/cuda-decoder-kernels.cu
+++ b/src/cudadecoder/cuda-decoder-kernels.cu
@@ -26,6 +26,10 @@
 #include "cuda-decoder-kernels.h"
 #include "cuda-decoder-kernels-utils.h"

+#ifndef FLT_MAX
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+#endif
+
 namespace kaldi {
 namespace cuda_decoder {

From c4515b07669a9d9e372fdb906c244961e91a32a9 Mon Sep 17 00:00:00 2001
From: "Jan \"yenda\" Trmal"
Date: Mon, 22 Jul 2024 13:42:31 +0200
Subject: [PATCH 57/76] Fix reported issues w.r.t python2.7 and some apple silicon quirks
---
 tools/extras/check_dependencies.sh | 77 ++++++++++++++++++------------
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index 155a376b6e6..12504104b9a 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -82,61 +82,73 @@ if ! have libtoolize && ! have glibtoolize; then
   add_packages libtool
 fi

-if ! have svn; then
-  echo "$0: subversion is not installed"
-  add_packages subversion
-fi
-
 if !
have awk; then echo "$0: awk is not installed" add_packages gawk fi -pythonok=true +pythonok=false +python3=false +python27=false + + if ! have python2.7; then echo "$0: python2.7 is not installed" - add_packages python27 python2.7 - pythonok=false +else + echo "$0: python2.7 present" + python27=true + pythonok=true fi if ! have python3; then echo "$0: python3 is not installed" add_packages python3 - pythonok=false +else + echo "$0: python3 present" + python3=true + pythonok=true fi ( #Use a subshell so that sourcing env.sh does not have an influence on the rest of the script [ -f ./env.sh ] && . ./env.sh -if $pythonok && ! have python2; then - mkdir -p $PWD/python - echo "$0: python2.7 is installed, but the python2 binary does not exist." \ - "Creating a symlink and adding this to tools/env.sh" - ln -s $(command -v python2.7) $PWD/python/python2 - echo "export PATH=$PWD/python:\${PATH}" >> env.sh -fi - -if [[ -f $PWD/python/.use_default_python && -f $PWD/python/python ]]; then - rm $PWD/python/python -fi - -if $pythonok && have python && [[ ! -f $PWD/python/.use_default_python ]]; then - version=$(python 2>&1 --version | awk '{print $2}') - if [[ $version != "2.7"* ]] ; then - echo "$0: WARNING python 2.7 is not the default python. We fixed this by" \ - "adding a correct symlink more prominently on the path." - echo " ... If you really want to use python $version as default, add an" \ +rm -f $PWD/python/python* +if ! [ -f $PWD/python/.use_default_python ]; then + echo "$0: Configuring python" + echo "$0: ... If you really want to avoid this, add an" \ "empty file $PWD/python/.use_default_python and run this script again." - mkdir -p $PWD/python + if $python27 ; then + echo "$0: ... python2.7 found, making it default (python, python2, python2.7)" ln -s $(command -v python2.7) $PWD/python/python - echo "export PATH=$PWD/python:\${PATH}" >> env.sh + ln -s $(command -v python2.7) $PWD/python/python2 + ln -s $(command -v python2.7) $PWD/python/python2.7 + fi + + if $python3 ; then + echo "$0: ... python3 found, making symlink (python3)" + ln -s $(command -v python3) $PWD/python/python3 + if ! $python27 ; then + echo "$0: ... ... python2.7 not found, using python3 as python" + ln -s $(command -v python3) $PWD/python/python + fi + fi +else + echo "$0: Not configuring python(s) -- using system defaults" + if ! have python ; then + echo "$0: WARNING: 'python' executable not present, configuring" + if $python27 ; then + ln -s $(command -v python2.7) $PWD/python/python + elif $python3 ; then + ln -s $(command -v python3) $PWD/python/python + fi fi fi + ) mathlib_missing=false -case $(uname -m) in - x86_64) # Suggest MKL on an Intel64 system (not supported on i?86 hosts). +case "$(uname -m)-$(uname -s)" in + x86_64*) # Suggest MKL on an Intel64 system (not supported on i?86 hosts). # Respect user-supplied MKL_ROOT environment variable. MKL_ROOT="${MKL_ROOT:-/opt/intel/mkl}" # Check the well-known mkl.h file location. @@ -155,6 +167,9 @@ case $(uname -m) in mathlib_missing=true fi ;; + arm64-Darwin) ## Apple Silicon + echo "$0: Relying on Acceleration framework" + ;; *) # Suggest OpenBLAS on other hardware. if [ ! -f $(pwd)/OpenBLAS/install/include/openblas_config.h ] && ! 
echo '#include ' | From ed29e165ecb50698abd241129fe1f909953f6375 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Mon, 22 Jul 2024 14:54:49 +0200 Subject: [PATCH 58/76] catch exception by reference so that compiler does not complain --- src/util/parse-options-test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/util/parse-options-test.cc b/src/util/parse-options-test.cc index af1fcc00880..ec7491ad9cb 100644 --- a/src/util/parse-options-test.cc +++ b/src/util/parse-options-test.cc @@ -120,7 +120,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My boolean"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -144,7 +144,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My string"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -186,7 +186,7 @@ void UnitTestParseOptions() { po4.Read(argc4, argv4); KALDI_ASSERT(val == "bar"); } - + try { // test error with --float=string int argc4 = 2; const char *argv4[2] = { "program_name", "--option=foo"}; @@ -195,7 +195,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My float"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -208,7 +208,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -220,7 +220,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -232,7 +232,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)xxx."; } @@ -244,7 +244,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My bool"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. 
- } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -258,7 +258,7 @@ void UnitTestParseOptions() { po4.Register("num", &num, "My int32 variable"); po4.Read(argc4, argv4); KALDI_ASSERT(num == 0); - } catch(std::exception e) { + } catch(std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } From 4145e17c12da9e4468a07b3895c84907c131ad6e Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 11:32:37 +0200 Subject: [PATCH 59/76] fix tests and address comments --- src/util/kaldi-table-test.cc | 40 ++++++++++++++++++++++------------ src/util/parse-options-test.cc | 16 +++++++------- src/util/stl-utils-test.cc | 6 ++--- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/util/kaldi-table-test.cc b/src/util/kaldi-table-test.cc index 358e33e686a..3613e44fc76 100644 --- a/src/util/kaldi-table-test.cc +++ b/src/util/kaldi-table-test.cc @@ -351,7 +351,8 @@ void UnitTestTableSequentialInt32(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -384,7 +385,8 @@ void UnitTestTableSequentialBool(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -418,7 +420,8 @@ void UnitTestTableSequentialDouble(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { KALDI_ASSERT(v2 == v); @@ -462,7 +465,8 @@ void UnitTestTableSequentialDoubleBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { KALDI_ASSERT(v2 == v); @@ -511,7 +515,8 @@ void UnitTestTableSequentialInt32VectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -551,7 +556,8 @@ void UnitTestTableSequentialInt32PairVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -594,7 +600,8 @@ void UnitTestTableSequentialInt32VectorVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); KALDI_ASSERT(v2 == v); } @@ -641,7 +648,8 @@ void UnitTestTableSequentialInt32Script(bool binary) { k2.push_back(sbr.Key()); v2.push_back(sbr.Value()); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); unlink("tmp.scp"); for (size_t i = 0; i < script.size(); i++) { @@ -684,7 +692,8 @@ void UnitTestTableSequentialDoubleMatrixBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(new Matrix(sbr.Value())); } - KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { for (size_t i = 0; i < v2.size(); i++) @@ -738,7 +747,8 @@ void UnitTestTableSequentialBaseFloatVectorBoth(bool binary, bool read_scp) { k2.push_back(sbr.Key()); v2.push_back(new Vector(sbr.Value())); } - 
KALDI_ASSERT(sbr.Close()); + ans = sbr.Close(); + KALDI_ASSERT(ans); KALDI_ASSERT(k2 == k); if (binary) { for (size_t i = 0; i < v2.size(); i++) @@ -831,10 +841,11 @@ void UnitTestTableRandomBothDouble(bool binary, bool read_scp, bool ans = sbr.HasKey(cur_key); KALDI_ASSERT(ans == true); } + auto v2 = sbr.Value(cur_key); if (binary) { - KALDI_ASSERT(value == sbr.Value(cur_key)); + KALDI_ASSERT(value == v2); } else { - KALDI_ASSERT(ApproxEqual(value, sbr.Value(cur_key))); + KALDI_ASSERT(ApproxEqual(value, v2)); } } } @@ -1039,10 +1050,11 @@ void UnitTestTableRandomBothDoubleMatrix(bool binary, bool read_scp, bool ans = sbr.HasKey(cur_key); KALDI_ASSERT(ans == true); } + auto v2 = sbr.Value(cur_key); if (binary) { - KALDI_ASSERT(value_ptr->ApproxEqual(sbr.Value(cur_key), 1.0e-10)); + KALDI_ASSERT(value_ptr->ApproxEqual(v2, 1.0e-10)); } else { - KALDI_ASSERT(value_ptr->ApproxEqual(sbr.Value(cur_key), 0.01)); + KALDI_ASSERT(value_ptr->ApproxEqual(v2, 0.01)); } } } diff --git a/src/util/parse-options-test.cc b/src/util/parse-options-test.cc index ec7491ad9cb..b242130b8c7 100644 --- a/src/util/parse-options-test.cc +++ b/src/util/parse-options-test.cc @@ -120,7 +120,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My boolean"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -144,7 +144,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My string"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -195,7 +195,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My float"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -208,7 +208,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -220,7 +220,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -232,7 +232,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My int"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)xxx."; } @@ -244,7 +244,7 @@ void UnitTestParseOptions() { po4.Register("option", &val, "My bool"); po4.Read(argc4, argv4); assert(false); // Should not reach this part of code. 
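The Close() rewrites in the kaldi-table-test.cc hunks above all follow one pattern, and the reason deserves a note: an expression with side effects must not live inside an assertion macro that can be compiled out. A sketch, assuming KALDI_ASSERT can be disabled like the standard assert():

#include <cassert>
#include <cstdio>

// Stand-in for SequentialTableReader::Close(): it has a side effect
// (releasing the stream) and returns a success flag.
static bool CloseStream() { std::puts("stream closed"); return true; }

int main() {
  // Risky: if assertions compile to nothing (e.g. -DNDEBUG), the call
  // disappears with them and the stream is never closed.
  assert(CloseStream());

  // Safe: the side effect always runs; only the check can vanish.
  bool ans = CloseStream();
  assert(ans);
  (void)ans;
  return 0;
}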
- } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } @@ -258,7 +258,7 @@ void UnitTestParseOptions() { po4.Register("num", &num, "My int32 variable"); po4.Read(argc4, argv4); KALDI_ASSERT(num == 0); - } catch(std::exception &e) { + } catch(const std::exception &e) { KALDI_LOG << "Failed to read option (this is expected)."; } diff --git a/src/util/stl-utils-test.cc b/src/util/stl-utils-test.cc index 11781e2f938..3a54fc82bac 100644 --- a/src/util/stl-utils-test.cc +++ b/src/util/stl-utils-test.cc @@ -148,9 +148,9 @@ void TestCopyMapValuesToVector() { CopyMapValuesToVector(mp, &v); KALDI_ASSERT(mp.size() == v.size()); int i = 0; - for (std::map::iterator iter = mp.begin(); iter != mp.end(); - iter++) { - KALDI_ASSERT(v[i++] == iter->second); + for (auto iter = mp.begin(); iter != mp.end(); + iter++, i++) { + KALDI_ASSERT(v[i] == iter->second); } } } From 38ea2b1c924d2a5a623aaac700b9ee3f3ab0e953 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 13:55:12 +0200 Subject: [PATCH 60/76] disable warning about unused flags msse and msse on Apple Silicon --- src/configure | 2 ++ src/makefiles/darwin_arm64.mk | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/makefiles/darwin_arm64.mk diff --git a/src/configure b/src/configure index 82e4e3d9149..f55e320ff97 100755 --- a/src/configure +++ b/src/configure @@ -1150,6 +1150,8 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" + elif [ "`uname -m`" == "arm64" ]; then + cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk fi diff --git a/src/makefiles/darwin_arm64.mk b/src/makefiles/darwin_arm64.mk new file mode 100644 index 00000000000..149a3d97118 --- /dev/null +++ b/src/makefiles/darwin_arm64.mk @@ -0,0 +1,36 @@ +# Darwin (macOS) configuration + +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif +ifndef OPENFSTINC +$(error OPENFSTINC not defined.) +endif +ifndef OPENFSTLIBS +$(error OPENFSTLIBS not defined.) +endif + +CXXFLAGS = -std=c++14 -I.. -I$(OPENFSTINC) -O1 $(EXTRA_CXXFLAGS) \ + -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ + -Wno-deprecated-declarations -Winit-self \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DHAVE_EXECINFO_H=1 -DHAVE_CXXABI_H -DHAVE_CLAPACK \ + -pthread \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +# Compiler specific flags +COMPILER = $(shell $(CXX) -v 2>&1) +ifeq ($(findstring clang,$(COMPILER)),clang) +# Suppress annoying clang warnings that are perfectly valid per spec. +CXXFLAGS += -Wno-mismatched-tags +else ifeq ($(findstring GCC,$(COMPILER)),GCC) +# Allow implicit conversions between vectors. 
+CXXFLAGS += -flax-vector-conversions +endif + +LDFLAGS = $(EXTRA_LDFLAGS) $(OPENFSTLDFLAGS) -g +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) -framework Accelerate -lm -lpthread -ldl From 3cf3c1a72caec64d01e00546dfdac9a33c805641 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 14:53:59 +0200 Subject: [PATCH 61/76] do a full cleanup on apple silicon --- src/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index bc4375e30f6..5036d12b707 100644 --- a/src/Makefile +++ b/src/Makefile @@ -62,14 +62,16 @@ endif # Don't call rm -rf. rmlibdir: +ifeq ($(KALDI_FLAVOR), dynamic) ifneq ($(KALDILIBDIR), ) - -rm -f $(KALDILIBDIR)/*{.so,.a,.o} + -rm -f $(KALDILIBDIR)/*{.so,.a,.o,.dylib} -rmdir 2>/dev/null $(KALDILIBDIR); true else # KALDILIBDIR might have been unset because of reconfigure. Do a best guess. @echo "Something seems wrong. Please re-run configure." @echo "I will continue but the cleanup might not be complete." endif +endif kaldi.mk: @echo "ERROR: kaldi.mk does not exist; run ./configure first."; From 770daa212c5cddb1a559e2ac0906eaf3511b594d Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 16:07:03 +0200 Subject: [PATCH 62/76] improve compatibility with C++ standard, esp. C++20 --- src/cudamatrix/cu-array.h | 12 ++++++------ src/cudamatrix/cu-matrix.h | 22 +++++++++++----------- src/cudamatrix/cu-tp-matrix.h | 16 ++++++++-------- src/cudamatrix/cu-vector.h | 2 +- src/matrix/kaldi-matrix.h | 4 ++-- src/matrix/qr.cc | 26 +++++++++++++------------- 6 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 84f78f00a91..aaaddad75c8 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -105,7 +105,7 @@ class CuArrayBase { protected: /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuArrayBase(): data_(NULL), dim_(0) { } + CuArrayBase(): data_(NULL), dim_(0) { } T *data_; ///< GPU data pointer (if GPU not available, @@ -126,19 +126,19 @@ class CuArray: public CuArrayBase { /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. - CuArray() { } + CuArray() { } /// Constructor with memory initialisation. resize_type may be kSetZero or /// kUndefined. - explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) + explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) { Resize(dim, resize_type); } /// Constructor from CPU-based int vector - explicit CuArray(const std::vector &src) { CopyFromVec(src); } + explicit CuArray(const std::vector &src) { CopyFromVec(src); } /// Copy constructor. We don't make this explicit because we want to be able /// to create a std::vector. - CuArray(const CuArray &src) { CopyFromArray(src); } + CuArray(const CuArray &src) { CopyFromArray(src); } /// Destructor ~CuArray() { Destroy(); } @@ -182,7 +182,7 @@ class CuSubArray: public CuArrayBase { /// Constructor as a range of an existing CuArray or CuSubArray. Note: like /// similar constructors in class CuVector and others, it can be used to evade /// 'const' constraints; don't do that. 
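The cu-array.h edits above are whitespace-only, but the idiom they brush past is worth a sketch: the base class keeps its constructor protected so that only the owning derived types can be instantiated (simplified, hypothetical names, not the real CuArray API):

#include <cstddef>

template <typename T>
class ArrayBase {
 protected:
  ArrayBase() : data_(nullptr), dim_(0) {}  // only derived classes call this
  T *data_;
  std::size_t dim_;
};

template <typename T>
class Array : public ArrayBase<T> {
 public:
  Array() {}  // users create Array<T>; a bare ArrayBase<T> will not compile
};

int main() {
  Array<int> a;
  // ArrayBase<int> b;  // error: protected constructor
  (void)a;
  return 0;
}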
- explicit CuSubArray(const CuArrayBase &src, + explicit CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim); /// Construct from raw pointers diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 3ffe67d8b06..775fecd82c6 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -250,7 +250,7 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + // This function will copy from source rows (start_range, end_range] // if the range is outside of the clamped region then the clamped // row will be replicated across the out of range areas @@ -307,9 +307,9 @@ class CuMatrixBase { void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); void Floor(const CuMatrixBase &src, Real floor_val); - + void Ceiling(const CuMatrixBase &src, Real ceiling_val); - + /// This is equivalent to running: /// Floor(src, lower_limit); /// Ceiling(src, upper_limit); @@ -320,7 +320,7 @@ class CuMatrixBase { /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. void ExpSpecial(const CuMatrixBase &src); - + /// Softmax nonlinearity /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, /// with attention to avoiding overflow or underflow. @@ -333,7 +333,7 @@ class CuMatrixBase { /// Supports in-place operation (i.e. this == &src). void LogSoftMaxPerRow(const CuMatrixBase &src); - + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -439,23 +439,23 @@ class CuMatrixBase { this -> Pow(*this, power); }; - + inline void ApplyPowAbs(Real power, bool include_sign=false) { this -> PowAbs(*this, power, include_sign); }; - + inline void ApplyHeaviside() { this -> Heaviside(*this); }; - + inline void ApplyFloor(Real floor_val) { this -> Floor(*this, floor_val); }; - + inline void ApplyCeiling(Real ceiling_val) { this -> Ceiling(*this, ceiling_val); }; - + inline void ApplyExp() { this -> Exp(*this); }; @@ -924,7 +924,7 @@ class CuSubMatrix: public CuMatrixBase { /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. - inline CuSubMatrix (const CuSubMatrix &other): + inline CuSubMatrix(const CuSubMatrix &other): CuMatrixBase (other.data_, other.num_rows_, other.num_cols_, other.stride_) {} private: diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 8de46ec46f5..4219467f615 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -48,18 +48,18 @@ class CuTpMatrix : public CuPackedMatrix { CuTpMatrix() : CuPackedMatrix() {} explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) : CuPackedMatrix(r, resize_type) {} - - explicit CuTpMatrix(const TpMatrix &orig) + + explicit CuTpMatrix(const TpMatrix &orig) : CuPackedMatrix(orig) {} // This constructor lacks the "explicit" keyword so that // we can include this class in std::vector. 
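The comment above, about omitting the explicit keyword so the class can be used in a std::vector, reflects a general rule: several standard copy contexts use copy-initialization, which an explicit copy constructor forbids. A minimal demonstration with a hypothetical type:

#include <vector>

struct M {
  M() {}
  M(const M &) {}  // deliberately not explicit
};

int main() {
  M a;
  M b = a;                    // copy-initialization: needs a non-explicit copy ctor
  std::vector<M> v = {a, b};  // initializing a vector from elements does too
  (void)v;
  return 0;
}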
- CuTpMatrix(const CuTpMatrix &orig) + CuTpMatrix(const CuTpMatrix &orig) : CuPackedMatrix(orig) {} - - explicit CuTpMatrix(const CuMatrixBase &orig, + + explicit CuTpMatrix(const CuMatrixBase &orig, MatrixTransposeType trans = kNoTrans); - + ~CuTpMatrix() {} void CopyFromMat(const CuMatrixBase &M, @@ -70,12 +70,12 @@ class CuTpMatrix : public CuPackedMatrix { } void CopyFromTp(const TpMatrix &other) { CuPackedMatrix::CopyFromPacked(other); - } + } void Cholesky(const CuSpMatrix& Orig); void Invert(); CuTpMatrix &operator = (const CuTpMatrix &in); - + protected: inline const TpMatrix &Mat() const { return *(reinterpret_cast* >(this)); diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index f1c32756887..82e1fb47fcb 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -243,7 +243,7 @@ class CuVectorBase { /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuVectorBase(): data_(NULL), dim_(0) { } + CuVectorBase(): data_(NULL), dim_(0) { } Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 064edf4237b..b44bf1d934f 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -1006,11 +1006,11 @@ class SubMatrix : public MatrixBase { MatrixIndexT num_cols, MatrixIndexT stride); - ~SubMatrix() {} + ~SubMatrix() {} /// This type of constructor is needed for Range() to work [in Matrix base /// class]. Cannot make it explicit. - SubMatrix (const SubMatrix &other): + SubMatrix(const SubMatrix &other): MatrixBase (other.data_, other.num_cols_, other.num_rows_, other.stride_) {} diff --git a/src/matrix/qr.cc b/src/matrix/qr.cc index 861dead05ba..db1b7359de9 100644 --- a/src/matrix/qr.cc +++ b/src/matrix/qr.cc @@ -57,7 +57,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { if (max_x == 0.0) max_x = 1.0; s = 1.0 / max_x; } - + Real sigma = 0.0; v[0] = 1.0; for (MatrixIndexT i = 1; i < dim; i++) { @@ -73,7 +73,7 @@ void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { v[0] = x1 - mu; } else { v[0] = -sigma / (x1 + mu); - KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); + KALDI_ASSERT(KALDI_ISFINITE(v[dim-1])); } Real v1 = v[0]; Real v1sq = v1 * v1; @@ -155,11 +155,11 @@ void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) { with packed lower-triangular matrices to do it this way. There's also a shift from one-based to zero-based indexing, so the index k is transformed k -> n - k, and a corresponding transpose... - + Let the original *this be A. This algorithms replaces *this with a tridiagonal matrix T such that T = Q A Q^T for an orthogonal Q. Caution: Q is transposed vs. Golub and Van Loan. - If Q != NULL it outputs Q. + If Q != NULL it outputs Q. */ template void SpMatrix::Tridiagonalize(MatrixBase *Q) { @@ -195,7 +195,7 @@ void SpMatrix::Tridiagonalize(MatrixBase *Q) { if (Q != NULL) { // C.f. Golub, Q is H_1 .. H_n-2... in this // case we apply them in the opposite order so it's H_n-1 .. H_1, // but also Q is transposed so we really have Q = H_1 .. H_n-1. - // It's a double negative. + // It's a double negative. // Anyway, we left-multiply Q by each one. The H_n would each be // diag(I + beta v v', I) but we don't ever touch the last dims. // We do (in Matlab notation): @@ -309,7 +309,7 @@ void QrStep(MatrixIndexT n, if (k < n-2) { // Next is the elements (k+2, k) and (k+2, k-1), to be rotated, again // backwards. 
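For orientation in the qr.cc hunks: House() constructs v and beta so that the reflection (I - beta v v^T) maps x onto ||x|| e_1, which is what drives the tridiagonalization. A two-dimensional numeric check of that identity (standalone arithmetic, not Kaldi code):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double x[2] = {3.0, 4.0};
  double mu = std::sqrt(x[0] * x[0] + x[1] * x[1]);  // ||x|| = 5
  double v[2] = {x[0] - mu, x[1]};                   // v = x - ||x|| * e_1
  double beta = 2.0 / (v[0] * v[0] + v[1] * v[1]);
  double vtx = v[0] * x[0] + v[1] * x[1];            // v^T x
  double y0 = x[0] - beta * v[0] * vtx;              // first coord of the image
  double y1 = x[1] - beta * v[1] * vtx;              // second coord, should be 0
  std::printf("y = (%g, %g)\n", y0, y1);             // expect (5, 0)
  assert(std::fabs(y0 - 5.0) < 1e-12 && std::fabs(y1) < 1e-12);
  return 0;
}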
- Real &elem_kp2_k = z, + Real &elem_kp2_k = z, &elem_kp2_kp1 = off_diag[k+1]; // Note: elem_kp2_k == z would start off as zero because it's // two off the diagonal, and not been touched yet. Therefore @@ -338,7 +338,7 @@ void QrInternal(MatrixIndexT n, MatrixIndexT counter = 0, max_iters = 500 + 4*n, // Should never take this many iters. large_iters = 100 + 2*n; Real epsilon = (pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0)); - + for (; counter < max_iters; counter++) { // this takes the place of "until // q=n"... we'll break out of the // loop when we converge. @@ -356,7 +356,7 @@ void QrInternal(MatrixIndexT n, off_diag[i] = 0.0; } // The next code works out p, q, and npq which is n - p - q. - // For the definitions of q and p, see Golub and Van Loan; we + // For the definitions of q and p, see Golub and Van Loan; we // partition the n dims into pieces of size (p, n-p-q, q) where // the part of size q is diagonal and the part of size n-p-p is // "unreduced", i.e. has no zero off-diagonal elements. @@ -392,7 +392,7 @@ void QrInternal(MatrixIndexT n, } else { QrStep(npq, diag + p, off_diag + p, static_cast*>(NULL)); - } + } } if (counter == max_iters) { KALDI_WARN << "Failure to converge in QR algorithm. " @@ -490,7 +490,7 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, r.AddSpVec(1.0, S, Q.Row(d), 0.0); // r = S * q_d MatrixIndexT counter = 0; - Real end_prod; + Real end_prod = 0; while (1) { // Normally we'll do this loop only once: // we repeat to handle cases where r gets very much smaller // and we want to orthogonalize again. @@ -528,11 +528,11 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, } } - Matrix R(lanczos_dim, lanczos_dim); + Matrix R(lanczos_dim, lanczos_dim); R.SetUnit(); T.Qr(&R); // Diagonalizes T. Vector s_tmp(lanczos_dim); - s_tmp.CopyDiagFromSp(T); + s_tmp.CopyDiagFromSp(T); // Now T = R * diag(s_tmp) * R^T. // The next call sorts the elements of s from greatest to least absolute value, @@ -544,7 +544,7 @@ void SpMatrix::TopEigs(VectorBase *s, MatrixBase *P, SubMatrix Rsub(R, 0, eig_dim, 0, lanczos_dim); SubVector s_sub(s_tmp, 0, eig_dim); s->CopyFromVec(s_sub); - + // For working out what to do now, just assume the other eigenvalues were // zero. This is just for purposes of knowing how to get the result, and // not getting things wrongly transposed. From 816d438453fa65e90139676bb932ad06229421f5 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jul 2024 16:15:12 +0200 Subject: [PATCH 63/76] make codefactor happier --- src/configure | 2 +- src/cudamatrix/cu-array.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/configure b/src/configure index f55e320ff97..c1e44512af9 100755 --- a/src/configure +++ b/src/configure @@ -1150,7 +1150,7 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" - elif [ "`uname -m`" == "arm64" ]; then + elif [ "$(uname -m)" == "arm64" ]; then cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index aaaddad75c8..3db44bf4aa5 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -111,7 +111,6 @@ class CuArrayBase { T *data_; ///< GPU data pointer (if GPU not available, ///< will point to CPU memory). 
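One qr.cc change above is behavioral rather than cosmetic: end_prod in TopEigs() is now zero-initialized before the retry loop, so the first comparison against it is well defined instead of possibly reading an indeterminate value. The hazard in miniature:

#include <cstdio>

int main() {
  double end_prod = 0;  // the patched form; without "= 0" the comparison
                        // below could read uninitialized memory (UB)
  while (true) {
    double new_prod = 0.25;           // stand-in for the recomputed product
    if (new_prod == end_prod) break;  // evaluated before any in-loop write
    end_prod = new_prod;
  }
  std::printf("%g\n", end_prod);
  return 0;
}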
MatrixIndexT dim_; ///< dimension of the vector - }; /** @@ -123,7 +122,6 @@ class CuArrayBase { template class CuArray: public CuArrayBase { public: - /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. CuArray() { } @@ -172,7 +170,6 @@ class CuArray: public CuArrayBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; - }; From 3f170d4d78584dfaf584d21ecddb020a73055853 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:23:57 +0300 Subject: [PATCH 64/76] Support openfst-1.7.6 --- src/chain/chain-supervision.cc | 10 ++++------ src/fstext/fstext-utils-inl.h | 12 ++++++------ src/fstext/kaldi-fst-io-inl.h | 2 +- src/fstext/pre-determinize-inl.h | 4 ++-- src/kws/kws-functions.cc | 2 +- src/lat/kaldi-lattice.cc | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f8a2c1d11cc..b29000a448c 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -571,9 +571,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst::WriteFst( - fst, fst::AcceptorCompactor(), os, - write_options); + fst::StdCompactAcceptorFst cfst(fst); + cfst.Write(os, write_options); } } else { KALDI_ASSERT(e2e_fsts.size() == num_sequences); @@ -586,9 +585,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst::WriteFst( - e2e_fsts[i], fst::AcceptorCompactor(), os, - write_options); + fst::StdCompactAcceptorFst cfst(e2e_fsts[i]); + cfst.Write(os, write_options); } } WriteToken(os, binary, ""); diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 853697387b9..d877c03e1ae 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -374,12 +374,12 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); - for (SymbolTableIterator iter(symtab); - !iter.Done(); - iter.Next()) { - if (include_eps || iter.Value() != 0) { - syms_out->push_back(iter.Value()); - KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. + for (SymbolTable::iterator iter = symtab.begin(); + iter != symtab.end(); + ++iter) { + if (include_eps || iter->Label() != 0) { + syms_out->push_back(iter->Label()); + KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. } } } diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index b6bae4b9dc9..f7bb3a7c2b5 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -44,7 +44,7 @@ void WriteFstKaldi(std::ostream &os, bool binary, bool acceptor = false, write_one = false; FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; // Write another newline as a terminating character. 
The read routine will diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index ea6608ce38a..7c1b544da4c 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -235,8 +235,8 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix, assert(symTable != NULL); const char *prefix_ptr = prefix.c_str(); size_t prefix_len = strlen(prefix_ptr); // allowed to be zero but not encouraged. - for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) { - const std::string &sym = siter.Symbol(); + for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) { + const std::string &sym = siter->Symbol(); if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) { // has prefix. if (isdigit(sym[prefix_len])) { // we don't allow prefix followed by a digit, as a symbol. // Has at least one digit. diff --git a/src/kws/kws-functions.cc b/src/kws/kws-functions.cc index d1d71ce7a42..3e27226f13c 100644 --- a/src/kws/kws-functions.cc +++ b/src/kws/kws-functions.cc @@ -75,7 +75,7 @@ bool ClusterLattice(CompactLattice *clat, unordered_map >::iterator iter; for (iter = head.begin(); iter != head.end(); ++iter) { // For this ilabel, sort all the arcs on time, from first to last. - sort(iter->second.begin(), iter->second.end(), CompareInterval); + std::sort(iter->second.begin(), iter->second.end(), CompareInterval); std::vector tmp; tmp.push_back(iter->second[0]); for (int32 i = 1; i < iter->second.size(); i++) { diff --git a/src/lat/kaldi-lattice.cc b/src/lat/kaldi-lattice.cc index 744cc538462..648e67115b7 100644 --- a/src/lat/kaldi-lattice.cc +++ b/src/lat/kaldi-lattice.cc @@ -78,7 +78,7 @@ bool WriteCompactLattice(std::ostream &os, bool binary, fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. The read routine will @@ -403,7 +403,7 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) { fst::FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(&os, ""); + printer.Print(os, ""); if (os.fail()) KALDI_WARN << "Stream failure detected."; // Write another newline as a terminating character. 
The read routine will From e460d8a921209190568f6e5d11226d00377034d8 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:12 +0300 Subject: [PATCH 65/76] Support openfst-1.8.0 --- src/fstext/fstext-utils-inl.h | 2 +- src/fstext/fstext-utils.h | 2 +- src/fstext/lattice-utils-inl.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index d877c03e1ae..44e641a3f20 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -163,7 +163,7 @@ void RemoveSomeInputSymbols(const std::vector &to_remove, MutableFst *fst) { KALDI_ASSERT_IS_INTEGER_TYPE(I); RemoveSomeInputSymbolsMapper mapper(to_remove); - Map(fst, mapper); + ArcMap(fst, mapper); } template diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h index 5789dbe7cc3..db14ddd3576 100644 --- a/src/fstext/fstext-utils.h +++ b/src/fstext/fstext-utils.h @@ -113,7 +113,7 @@ void PushInLog(VectorFst *fst, uint32 ptype, float delta = kDelta) { template void MinimizeEncoded(VectorFst *fst, float delta = kDelta) { - Map(fst, QuantizeMapper(delta)); + ArcMap(fst, QuantizeMapper(delta)); EncodeMapper encoder(kEncodeLabels | kEncodeWeights, ENCODE); Encode(fst, &encoder); internal::AcceptorMinimize(fst); diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index c97a538dd1d..5d52ed3aa5a 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -268,7 +268,7 @@ void ConvertFstToLattice( MutableFst > > *ofst) { int32 num_states_cache = 50000; fst::CacheOptions cache_opts(true, num_states_cache); - fst::MapFstOptions mapfst_opts(cache_opts); + fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; MapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); From 5ccce55e04a59f0e5d0e400ef31f008e5940fa07 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:18 +0300 Subject: [PATCH 66/76] Support openfst-1.8.1 --- src/fstext/kaldi-fst-io-inl.h | 2 +- src/fstext/lattice-weight.h | 16 ++++++++-------- src/lat/kaldi-lattice.cc | 2 +- src/lat/lattice-functions-transition-model.cc | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index f7bb3a7c2b5..01047919c22 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -99,7 +99,7 @@ void ReadFstKaldi(std::istream &is, bool binary, fst->DeleteStates(); string line; size_t nline = 0; - string separator = FLAGS_fst_field_separator + "\r\n"; + string separator = FST_FLAGS_fst_field_separator + "\r\n"; while (std::getline(is, line)) { nline++; vector col; diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 6e7737a195d..f03ed702588 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -396,8 +396,8 @@ inline bool ApproxEqual(const LatticeWeightTpl &w1, template inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl &w) { LatticeWeightTpl::WriteFloatType(strm, w.Value1()); - CHECK(FLAGS_fst_weight_separator.size() == 1); - strm << FLAGS_fst_weight_separator[0]; // comma by default; + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); + strm << FST_FLAGS_fst_weight_separator[0]; // comma by default; // may or may not be settable from Kaldi programs. 
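The FLAGS_* to FST_FLAGS_* renames in this patch track OpenFst 1.8.1, which moved its command-line flags behind prefixed macros. If a single tree had to build against both old and new OpenFst, the rename could be hidden behind a shim keyed on a numeric version macro like the one configure computes (a hypothetical sketch; the patch itself simply adopts the new names):

#include <fst/weight.h>  // declares the separator flag (location may vary by version)

#if OPENFST_VER >= 10801  // assumed numeric version macro, e.g. 1.8.1 -> 10801
#define KALDI_FST_WEIGHT_SEPARATOR FST_FLAGS_fst_weight_separator
#else
#define KALDI_FST_WEIGHT_SEPARATOR FLAGS_fst_weight_separator
#endif

// Call sites would then use KALDI_FST_WEIGHT_SEPARATOR[0] unconditionally.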
LatticeWeightTpl::WriteFloatType(strm, w.Value2()); return strm; @@ -405,9 +405,9 @@ inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl inline std::istream &operator >>(std::istream &strm, LatticeWeightTpl &w1) { - CHECK(FLAGS_fst_weight_separator.size() == 1); + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); // separator defaults to ',' - return w1.ReadNoParen(strm, FLAGS_fst_weight_separator[0]); + return w1.ReadNoParen(strm, FST_FLAGS_fst_weight_separator[0]); } @@ -726,8 +726,8 @@ inline CompactLatticeWeightTpl Divide(const CompactLatticeW template inline std::ostream &operator <<(std::ostream &strm, const CompactLatticeWeightTpl &w) { strm << w.Weight(); - CHECK(FLAGS_fst_weight_separator.size() == 1); - strm << FLAGS_fst_weight_separator[0]; // comma by default. + CHECK(FST_FLAGS_fst_weight_separator.size() == 1); + strm << FST_FLAGS_fst_weight_separator[0]; // comma by default. for(size_t i = 0; i < w.String().size(); i++) { strm << w.String()[i]; if (i+1 < w.String().size()) @@ -743,8 +743,8 @@ inline std::istream &operator >>(std::istream &strm, CompactLatticeWeightTpl col; diff --git a/src/lat/lattice-functions-transition-model.cc b/src/lat/lattice-functions-transition-model.cc index 6172610dca0..a8cd7b7e2dd 100644 --- a/src/lat/lattice-functions-transition-model.cc +++ b/src/lat/lattice-functions-transition-model.cc @@ -248,13 +248,13 @@ bool TestWordAlignedLattice(const WordAlignLatticeLexiconInfo &lexicon_info, int32 num_paths = 5, seed = Rand(), max_path_length = -1; BaseFloat delta = 0.2; // some lattices have large costs -> use large delta. - FLAGS_v = GetVerboseLevel(); // set the OpenFst verbose level to the Kaldi + FST_FLAGS_v = GetVerboseLevel(); // set the OpenFst verbose level to the Kaldi // verbose level. 
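TestWordAlignedLattice above raises the OpenFst verbosity and then hard-resets it to 0 on the way out; a small RAII guard (hypothetical, not part of the patch) would restore the previous level on every exit path, including early returns:

#include <fst/log.h>  // declares the OpenFst verbosity flag (FST_FLAGS_v in >= 1.8.1)

class ScopedFstVerbosity {
 public:
  explicit ScopedFstVerbosity(int level) : saved_(FST_FLAGS_v) {
    FST_FLAGS_v = level;  // raise verbosity for the enclosing scope
  }
  ~ScopedFstVerbosity() { FST_FLAGS_v = saved_; }  // restore on scope exit
 private:
  int saved_;
};

// usage:
//   { ScopedFstVerbosity guard(GetVerboseLevel());
//     RandEquivalent(clat, aligned_clat, ...); }  // verbosity reverts here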
if (!RandEquivalent(clat, aligned_clat, num_paths, delta, seed, max_path_length)) { KALDI_WARN << "Equivalence test failed during lattice alignment."; return false; } - FLAGS_v = 0; + FST_FLAGS_v = 0; return (num_err == 0); } From 9a213bed4d9a89e7134b3bedf7fea16538b90382 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Thu, 23 Feb 2023 21:24:25 +0300 Subject: [PATCH 67/76] Support openfst-1.8.2 --- src/base/kaldi-types.h | 43 +++++++-------------------- src/fstext/lattice-utils-inl.h | 2 +- src/kws/kws-functions.cc | 4 +-- src/kws/kws-functions2.cc | 2 +- src/lat/arctic-weight.h | 2 +- src/lat/determinize-lattice-pruned.cc | 6 ++-- src/lat/minimize-lattice.cc | 2 +- src/lat/push-lattice.cc | 4 +-- src/lat/sausages.cc | 2 +- src/nnet3/nnet-batch-compute.cc | 2 +- src/online/online-tcp-source.cc | 2 +- src/rnnlm/rnnlm-test-utils.cc | 2 +- src/tree/tree-renderer.cc | 4 +-- 13 files changed, 28 insertions(+), 49 deletions(-) diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 7ebf4f85386..68d5578a5fb 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,37 +39,16 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file -#include - -namespace kaldi { - using ::int16; - using ::int32; - using ::int64; - using ::uint16; - using ::uint32; - using ::uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef float float32; +typedef double double64; #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index 5d52ed3aa5a..03ac9947c5c 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -270,7 +270,7 @@ void ConvertFstToLattice( fst::CacheOptions cache_opts(true, num_states_cache); fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; - MapFst >, + ArcMapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); *ofst = map_fst; } diff --git a/src/kws/kws-functions.cc b/src/kws/kws-functions.cc index 3e27226f13c..e6819562f82 100644 --- a/src/kws/kws-functions.cc +++ b/src/kws/kws-functions.cc @@ -175,7 +175,7 @@ bool CreateFactorTransducer(const CompactLattice &clat, // Now we map the CompactLattice to VectorFst. We drop the // alignment information and only keep the negated log-probs - Map(clat, factor_transducer, CompactLatticeToKwsProductFstMapper()); + ArcMap(clat, factor_transducer, CompactLatticeToKwsProductFstMapper()); // Now do the weight pushing manually on the CompactLattice format. 
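The kaldi-types.h rewrite above is the heart of the 1.8.2 port: OpenFst dropped fst/types.h, so Kaldi's fixed-width aliases now come straight from the standard headers. The property the typedefs are trusted for can be pinned down at compile time:

#include <cstdint>

typedef int32_t int32;   // as in the patched kaldi-types.h
typedef uint64_t uint64;

static_assert(sizeof(int32) == 4, "int32 must be exactly 32 bits");
static_assert(sizeof(uint64) == 8, "uint64 must be exactly 64 bits");

int main() { return 0; }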
Note that // the alphas and betas in Kaldi are stored as the log-probs, not the negated @@ -366,7 +366,7 @@ void MaybeDoSanityCheck(const KwsProductFst &product_transducer) { if (GetVerboseLevel() < 2) return; KwsLexicographicFst index_transducer; - Map(product_transducer, + ArcMap(product_transducer, &index_transducer, KwsProductFstToKwsLexicographicFstMapper()); diff --git a/src/kws/kws-functions2.cc b/src/kws/kws-functions2.cc index 71f5583af19..9e610d2054e 100644 --- a/src/kws/kws-functions2.cc +++ b/src/kws/kws-functions2.cc @@ -92,7 +92,7 @@ void DoFactorMerging(KwsProductFst *factor_transducer, Decode(&dest_transducer, encoder); - Map(dest_transducer, index_transducer, KwsProductFstToKwsLexicographicFstMapper()); + ArcMap(dest_transducer, index_transducer, KwsProductFstToKwsLexicographicFstMapper()); } void DoFactorDisambiguation(KwsLexicographicFst *index_transducer) { diff --git a/src/lat/arctic-weight.h b/src/lat/arctic-weight.h index 5c0c6d3c416..39775ac8950 100644 --- a/src/lat/arctic-weight.h +++ b/src/lat/arctic-weight.h @@ -50,7 +50,7 @@ class ArcticWeightTpl : public FloatWeightTpl { static const std::string &Type() { static const std::string type = std::string("arctic") + - FloatWeightTpl::GetPrecisionString(); + std::string(FloatWeightTpl::GetPrecisionString()); return type; } diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index dbdd9af4645..ff3d65d57f3 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -1499,7 +1499,7 @@ bool DeterminizeLatticePhonePrunedWrapper( } ILabelCompare ilabel_comp; ArcSort(ifst, ilabel_comp); - ans = DeterminizeLatticePhonePruned( + ans = DeterminizeLatticePhonePruned( trans_model, ifst, beam, ofst, opts); Connect(ofst); return ans; @@ -1523,7 +1523,7 @@ bool DeterminizeLatticePruned( DeterminizeLatticePrunedOptions opts); template -bool DeterminizeLatticePhonePruned( +bool DeterminizeLatticePhonePruned( const kaldi::TransitionInformation &trans_model, const ExpandedFst &ifst, double prune, @@ -1531,7 +1531,7 @@ bool DeterminizeLatticePhonePruned( DeterminizeLatticePhonePrunedOptions opts); template -bool DeterminizeLatticePhonePruned( +bool DeterminizeLatticePhonePruned( const kaldi::TransitionInformation &trans_model, MutableFst *ifst, double prune, diff --git a/src/lat/minimize-lattice.cc b/src/lat/minimize-lattice.cc index ada90efadce..416f1e62e93 100644 --- a/src/lat/minimize-lattice.cc +++ b/src/lat/minimize-lattice.cc @@ -279,7 +279,7 @@ bool MinimizeCompactLattice( // Instantiate for CompactLattice type. template -bool MinimizeCompactLattice( +bool MinimizeCompactLattice( MutableFst *clat, float delta); diff --git a/src/lat/push-lattice.cc b/src/lat/push-lattice.cc index f4eb322d002..38a990d74d3 100644 --- a/src/lat/push-lattice.cc +++ b/src/lat/push-lattice.cc @@ -280,11 +280,11 @@ bool PushCompactLatticeWeights( // Instantiate for CompactLattice. template -bool PushCompactLatticeStrings( +bool PushCompactLatticeStrings( MutableFst *clat); template -bool PushCompactLatticeWeights( +bool PushCompactLatticeWeights( MutableFst *clat); } // namespace fst diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index b851bc3604c..03b384f93f1 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -325,7 +325,7 @@ void MinimumBayesRisk::PrepareLatticeAndInitStats(CompactLattice *clat) { // paper (i.e. just one final state). // Topologically sort the lattice, if not already sorted. 
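The minimize-lattice.cc and push-lattice.cc hunks above adjust explicit template instantiations (the angle-bracketed argument lists themselves are not visible in this extraction). As background, C++ allows an explicit instantiation to omit template arguments that are deducible from the declaration, shown here with a hypothetical function:

template <class Weight>
bool PushWeights(Weight *w) { return w != nullptr; }

struct TropicalLike { float value; };

// fully spelled out: template bool PushWeights<TropicalLike>(TropicalLike *);
template bool PushWeights(TropicalLike *);  // argument deduced instead

int main() {
  TropicalLike t = {0.0f};
  return PushWeights(&t) ? 0 : 1;
}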
- kaldi::uint64 props = clat->Properties(fst::kFstProperties, false); + uint64 props = clat->Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { if (fst::TopSort(clat) == false) KALDI_ERR << "Cycles detected in lattice."; diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 0e07834ed3d..fd84c4e56fe 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -1503,7 +1503,7 @@ NnetBatchDecoder::~NnetBatchDecoder() { } // Print diagnostics. - kaldi::int64 input_frame_count = + int64 input_frame_count = frame_count_ * computer_->GetOptions().frame_subsampling_factor; int32 num_threads = static_cast(decode_threads_.size()); diff --git a/src/online/online-tcp-source.cc b/src/online/online-tcp-source.cc index 6d63493b4bd..8421073d559 100644 --- a/src/online/online-tcp-source.cc +++ b/src/online/online-tcp-source.cc @@ -24,7 +24,7 @@ namespace kaldi { -typedef kaldi::int32 int32; +typedef int32 int32; OnlineTcpVectorSource::OnlineTcpVectorSource(int32 socket) : socket_desc(socket), diff --git a/src/rnnlm/rnnlm-test-utils.cc b/src/rnnlm/rnnlm-test-utils.cc index 32e8b5a4236..f415f257a06 100644 --- a/src/rnnlm/rnnlm-test-utils.cc +++ b/src/rnnlm/rnnlm-test-utils.cc @@ -78,7 +78,7 @@ void ConvertToInteger( for (int i = 0; i < string_sentences.size(); i++) { (*int_sentences)[i].resize(string_sentences[i].size()); for (int j = 0; j < string_sentences[i].size(); j++) { - kaldi::int64 key = symbol_table.Find(string_sentences[i][j]); + int64 key = symbol_table.Find(string_sentences[i][j]); KALDI_ASSERT(key != -1); // fst::kNoSymbol (*int_sentences)[i][j] = static_cast(key); } diff --git a/src/tree/tree-renderer.cc b/src/tree/tree-renderer.cc index bbaa5cda162..8e3b463fe7a 100644 --- a/src/tree/tree-renderer.cc +++ b/src/tree/tree-renderer.cc @@ -67,7 +67,7 @@ TreeRenderer::MakeEdgeLabel(const EventKeyType &key, oss << ", "; if (key != kPdfClass) { std::string phone = - phone_syms_.Find(static_cast(*child)); + phone_syms_.Find(static_cast(*child)); if (phone.empty()) KALDI_ERR << "No phone found for Phone ID " << *child; oss << phone; @@ -137,7 +137,7 @@ void TreeRenderer::RenderTable(const EventType *query, int32 id) { ExpectToken(is_, binary_, "NULL"); // consume the invalid/NULL entry continue; } - std::string phone = phone_syms_.Find(static_cast(t)); + std::string phone = phone_syms_.Find(static_cast(t)); if (phone.empty()) KALDI_ERR << "Phone ID found in a TableEventMap, but not in the " << "phone symbol table! ID: " << t; From 122a3f239ed2f24271eb61b9aa3060fa06b820ac Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 25 Jul 2024 09:57:49 +0200 Subject: [PATCH 68/76] make nonconst catches const (#4926) --- src/hmm/posterior.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 860a979a0ce..bce0c84ad79 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -146,7 +146,7 @@ bool PosteriorHolder::Read(std::istream &is) { try { ReadPosterior(is, is_binary, &t_); return true; - } catch (std::exception &e) { + } catch (const std::exception &e) { KALDI_WARN << "Exception caught reading table of posteriors. " << e.what(); t_.clear(); return false; @@ -207,7 +207,7 @@ bool GaussPostHolder::Read(std::istream &is) { } } return true; - } catch (std::exception &e) { + } catch (const std::exception &e) { KALDI_WARN << "Exception caught reading table of posteriors. 
" << e.what(); t_.clear(); return false; From 57efb6b5e6baf55538864bcd9f48c822b8064f09 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Thu, 25 Jul 2024 11:19:09 +0200 Subject: [PATCH 69/76] add support for later openfst versions --- src/base/kaldi-error-test.cc | 2 +- src/base/kaldi-types.h | 14 +++++++ src/bin/phones-to-prons.cc | 3 +- src/configure | 10 ++++- src/fstext/context-fst-test.cc | 6 ++- src/fstext/determinize-lattice-test.cc | 14 ++++--- src/fstext/determinize-star-test.cc | 47 +++++++++++----------- src/fstext/factor-test.cc | 5 ++- src/fstext/fstext-lib.h | 3 ++ src/fstext/fstext-utils-inl.h | 11 +++++ src/fstext/fstext-utils-test.cc | 8 ++-- src/fstext/kaldi-fst-io-inl.h | 5 ++- src/fstext/kaldi-fst-io.h | 1 + src/fstext/lattice-utils-test.cc | 22 +++++----- src/fstext/lattice-weight.h | 1 + src/fstext/openfst_compat.h | 42 +++++++++++++++++++ src/fstext/pre-determinize-inl.h | 5 +++ src/fstext/pre-determinize-test.cc | 19 +++++---- src/fstext/prune-special-test.cc | 8 ++-- src/fstext/push-special-test.cc | 6 ++- src/fstext/remove-eps-local-test.cc | 9 +++-- src/fstext/table-matcher-test.cc | 18 +++++---- src/fstext/table-matcher.h | 2 +- src/fstext/trivial-factor-weight-test.cc | 17 ++++---- src/kwsbin/kws-search.cc | 2 + src/lat/determinize-lattice-pruned-test.cc | 14 ++++--- src/lat/kaldi-lattice.cc | 9 ++++- src/lat/push-lattice-test.cc | 6 ++- src/latbin/lattice-oracle.cc | 2 + src/makefiles/android_openblas.mk | 3 +- src/makefiles/cygwin.mk | 3 +- src/makefiles/darwin.mk | 3 +- src/makefiles/darwin_arm64.mk | 3 +- src/makefiles/darwin_clapack.mk | 3 +- src/makefiles/linux_atlas.mk | 3 +- src/makefiles/linux_atlas_arm.mk | 3 +- src/makefiles/linux_atlas_ppc64le.mk | 3 +- src/makefiles/linux_clapack.mk | 3 +- src/makefiles/linux_clapack_arm.mk | 3 +- src/makefiles/linux_openblas.mk | 3 +- src/makefiles/linux_openblas_aarch64.mk | 3 +- src/makefiles/linux_openblas_arm.mk | 3 +- src/makefiles/linux_openblas_ppc64le.mk | 3 +- src/makefiles/linux_x86_64_mkl.mk | 3 +- tools/Makefile | 2 +- 45 files changed, 246 insertions(+), 112 deletions(-) create mode 100644 src/fstext/openfst_compat.h diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 31440edf3f9..68ef224b5f5 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -76,7 +76,7 @@ int main() { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. 
exit(1); - } catch (kaldi::KaldiFatalError &e) { + } catch (const kaldi::KaldiFatalError &e) { std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 68d5578a5fb..6d96ecf2b75 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,6 +39,7 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include +#if OPENFST_VER >= 10800 typedef int8_t int8; typedef int16_t int16; typedef int32_t int32; @@ -50,5 +51,18 @@ typedef uint32_t uint32; typedef uint64_t uint64; typedef float float32; typedef double double64; +#else +#include +#endif +namespace kaldi { + using ::int16; + using ::int32; + using ::int64; + using ::uint16; + using ::uint32; + using ::uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 0d7ab12c232..535c18365ed 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -172,7 +172,8 @@ int main(int argc, char *argv[]) { if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cerr, "standard error"); + printer_print(std::cerr, fstprinter, "standard error"); + //fstprinter.Print(&std::cerr, "standard error"); KALDI_LOG << "phone sequence is: "; for (size_t i = 0; i < phones.size(); i++) std::cerr << phones[i] << ' '; diff --git a/src/configure b/src/configure index c1e44512af9..3743c31f76b 100755 --- a/src/configure +++ b/src/configure @@ -39,7 +39,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=14 +CONFIGURE_VERSION=15 # We support bash version 3.2 (Macs still ship with this version as of 2019) # and above. @@ -1024,6 +1024,14 @@ OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d" if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. You need OpenFst >= 1.6.0.)" fi + +if [ $OPENFST_VER_NUM -lt 10800 ]; then + echo "CXXLANGVERSION = c++14" +else + echo "CXXLANGVERSION = c++17" +fi >> kaldi.mk + +echo "OPENFSTVER = $OPENFST_VER_NUM" >> kaldi.mk echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" diff --git a/src/fstext/context-fst-test.cc b/src/fstext/context-fst-test.cc index 65da1bb0797..2589c5c344e 100644 --- a/src/fstext/context-fst-test.cc +++ b/src/fstext/context-fst-test.cc @@ -23,6 +23,8 @@ #include "util/kaldi-io.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -196,7 +198,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Sequence FST is:\n"; { // Try to print the fst. FstPrinter fstprinter(*f, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -224,7 +226,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Composed FST is:\n"; { // Try to print the fst. 
FstPrinter fstprinter(fst_composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index 886aa4cc1b9..ae902021c7d 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -22,6 +22,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; using std::cout; @@ -94,7 +96,7 @@ template void TestDeterminizeLattice() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst det_fst; try { @@ -106,7 +108,7 @@ template void TestDeterminizeLattice() { std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); // OK, now determinize it a different way and check equivalence. @@ -117,7 +119,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact FST is:\n"; { FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (kaldi::Rand() % 2 == 1) ConvertLattice(det_fst, &compact_det_fst, false); @@ -128,7 +130,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); @@ -149,14 +151,14 @@ template void TestDeterminizeLattice2() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; } diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index 814e6a38d9b..c3fabb8a21e 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -24,6 +24,7 @@ #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -38,7 +39,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST before determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; try { @@ -46,7 +47,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST after determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, 
true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { @@ -101,7 +102,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -109,7 +110,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -122,7 +123,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -138,7 +139,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -157,14 +158,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -174,7 +175,7 @@ template void TestDeterminize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -242,7 +243,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
Connect(fst); @@ -250,7 +251,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -267,7 +268,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, fst_pushed, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -320,7 +321,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -328,7 +329,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -341,7 +342,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -357,7 +358,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -370,7 +371,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -382,7 +383,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -392,7 +393,7 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -401,7 +402,7 @@ template void TestMinimize() { { std::cout <<" printing after minimization [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); @@ -410,7 +411,7 @@ template void TestMinimize() { { std::cout <<" printing factor-weight FST\n"; FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } Map(fwfst, &ofst_star, 
FromGallicMapper()); @@ -418,7 +419,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -431,7 +432,7 @@ template void TestMinimize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 687d0ad59b3..d58dbfa539c 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -79,7 +80,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(&fst); @@ -87,7 +88,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (fst.Start() == kNoStateId) return; // "Connect" made it empty. diff --git a/src/fstext/fstext-lib.h b/src/fstext/fstext-lib.h index bdb8ff730e5..03c8e5861dd 100644 --- a/src/fstext/fstext-lib.h +++ b/src/fstext/fstext-lib.h @@ -20,6 +20,9 @@ #ifndef KALDI_FSTEXT_FSTEXT_LIB_H_ #define KALDI_FSTEXT_FSTEXT_LIB_H_ #include "fst/fstlib.h" + +#include "fstext/openfst_compat.h" + #include "fstext/context-fst.h" #include "fstext/determinize-star.h" #include "fstext/factor.h" diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 44e641a3f20..fb3a637bc19 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -374,6 +374,7 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); +#if OPENFST_VER >= 10800 for (SymbolTable::iterator iter = symtab.begin(); iter != symtab.end(); ++iter) { @@ -382,6 +383,16 @@ void GetSymbols(const SymbolTable &symtab, KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. } } +#else + for (SymbolTableIterator iter(symtab); + !iter.Done(); + iter.Next()) { + if (include_eps || iter.Value() != 0) { + syms_out->push_back(iter.Value()); + KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. + } + } +#endif } template diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 4ce296f093a..460e49c7dec 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -23,6 +23,8 @@ #include "util/stl-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -140,7 +142,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc
index 4ce296f093a..460e49c7dec 100644
--- a/src/fstext/fstext-utils-test.cc
+++ b/src/fstext/fstext-utils-test.cc
@@ -23,6 +23,8 @@
 #include "util/stl-utils.h"
 #include "base/kaldi-math.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 using std::vector;
@@ -140,7 +142,7 @@ template<class Arc> void TestSafeDeterminizeWrapper() {  // also tests SafeDeterminizeMinimizeWrapper().
   std::cout <<" printing before trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
   // Trim resulting FST.
   Connect(fst);
@@ -148,7 +150,7 @@ template<class Arc> void TestSafeDeterminizeWrapper() {  // also tests SafeDeterminizeMinimizeWrapper().
   std::cout <<" printing after trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   VectorFst<Arc> *fst_copy_orig = new VectorFst<Arc>(*fst);
@@ -362,7 +364,7 @@ void TestEqualAlign() {
 
 template<class Arc> void Print(const Fst<Arc> &fst, std::string message) {
   std::cout << message << "\n";
   FstPrinter<Arc> fstprinter(fst, NULL, NULL, NULL, false, true, "\t");
-  fstprinter.Print(&std::cout, "standard output");
+  printer_print(std::cout, fstprinter, "standard output");
 }
 
diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h
index 01047919c22..3baa5b95c9c 100644
--- a/src/fstext/kaldi-fst-io-inl.h
+++ b/src/fstext/kaldi-fst-io-inl.h
@@ -24,6 +24,8 @@
 
 #include "util/text-utils.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
@@ -44,7 +46,8 @@ void WriteFstKaldi(std::ostream &os, bool binary,
     bool acceptor = false, write_one = false;
     FstPrinter<Arc> printer(t, t.InputSymbols(), t.OutputSymbols(),
                             NULL, acceptor, write_one, "\t");
-    printer.Print(os, "<unknown>");
+    //printer.Print(&os, "<unknown>");
+    printer_print(os, printer, "<unknown>");
     if (os.fail())
       KALDI_ERR << "Stream failure detected writing FST to stream";
     // Write another newline as a terminating character. The read routine will
diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h
index a45920936ec..3c34f4b4787 100644
--- a/src/fstext/kaldi-fst-io.h
+++ b/src/fstext/kaldi-fst-io.h
@@ -26,6 +26,7 @@
 #include <fst/fstlib.h>
 #include <fst/fst-decl.h>
 #include "base/kaldi-common.h"
+#include "fstext/openfst_compat.h"
 
 // Some functions for writing Fsts.
 // I/O for FSTs is a bit of a mess, and not very well integrated with Kaldi's
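With the text-mode branch of WriteFstKaldi() now routed through printer_print(), the same caller code compiles against both OpenFST generations. A minimal usage sketch (the FST contents are invented for illustration; the declaration used here is the one patched above):

    // Sketch only: writes a trivial one-state FST to stdout in Kaldi
    // text format.  DemoWriteTextFst is a hypothetical helper.
    #include <iostream>
    #include "fstext/kaldi-fst-io.h"

    void DemoWriteTextFst() {
      fst::StdVectorFst f;
      f.AddState();
      f.SetStart(0);
      f.SetFinal(0, fst::TropicalWeight::One());
      // binary == false selects the FstPrinter-based text path.
      fst::WriteFstKaldi(std::cout, false /*binary*/, f);
    }
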
diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc
index aa931d47d07..6f1d2747cc1 100644
--- a/src/fstext/lattice-utils-test.cc
+++ b/src/fstext/lattice-utils-test.cc
@@ -21,6 +21,8 @@
 #include "fstext/fst-test-utils.h"
 #include "base/kaldi-math.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
 template<class Weight, class Int> void TestConvert(bool invert) {
@@ -31,7 +33,7 @@ template<class Weight, class Int> void TestConvert(bool invert) {
     std::cout << "FST before converting to compact-arc is:\n";
     {
       FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<CompactArc> ofst;
     ConvertLattice(*fst, &ofst, invert);
@@ -39,14 +41,14 @@ template<class Weight, class Int> void TestConvert(bool invert) {
     std::cout << "FST after converting is:\n";
     {
       FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<Arc> origfst;
     ConvertLattice(ofst, &origfst, invert);
     std::cout << "FST after back conversion is:\n";
     {
       FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
@@ -67,7 +69,7 @@ template<class Weight, class Int> void TestShortestPath() {
     std::cout << "FST before converting to compact-arc is:\n";
     {
       FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true, "\t");
-      fstprinter.Print(&std::cout, "standard output");
+      printer_print(std::cout, fstprinter, "standard output");
     }
     VectorFst<CompactArc> cfst;
     ConvertLattice(*fst, &cfst, false);  // invert == false
@@ -205,7 +207,7 @@ template<class Int> void TestConvertPair(bool invert) {
   /*std::cout << "FST before converting to compact-arc is:\n";
   {
     FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<CompactArc> ofst;
   ConvertLattice(*fst, &ofst, invert);
@@ -213,14 +215,14 @@ template<class Int> void TestConvertPair(bool invert) {
   /*std::cout << "FST after converting is:\n";
   {
     FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<Arc> origfst;
   ConvertLattice(ofst, &origfst, invert);
   /*std::cout << "FST after back conversion is:\n";
   {
     FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/));
@@ -260,7 +262,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST before converting to compact-arc is:\n";
   {
     FstPrinter<Arc> fstprinter(*fst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<CompactArc> ofst;
   ConvertLattice(*fst, &ofst, invert);
@@ -268,7 +270,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST after converting and scaling is:\n";
   {
     FstPrinter<CompactArc> fstprinter(ofst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   VectorFst<Arc> origfst;
   ConvertLattice(ofst, &origfst, invert);
@@ -276,7 +278,7 @@ template<class Int> void TestScalePair(bool invert) {
   /*std::cout << "FST after back conversion and scaling is:\n";
   {
     FstPrinter<Arc> fstprinter(origfst, NULL, NULL, NULL, false, true);
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }*/
   // If RandEquivalent doesn't work, it could be due to a nasty issue related to the use
   // of exact floating-point comparisons in the Plus function of LatticeWeight.
diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h
index f03ed702588..1396764000a 100644
--- a/src/fstext/lattice-weight.h
+++ b/src/fstext/lattice-weight.h
@@ -23,6 +23,7 @@
 #include "fst/fstlib.h"
 #include "base/kaldi-common.h"
+#include "fstext/openfst_compat.h"
 
 namespace fst {
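All of these call-site changes lean on a single new header, introduced below. Besides the printer_print() and Map() shims, it maps the FST_FLAGS_* gflags spellings used by OpenFST 1.8 back to the bare FLAGS_* names of older releases, so flag reads can use one spelling everywhere. A hedged sketch of that pattern (WeightSeparator is a hypothetical helper, not part of this patch):

    // Sketch only: reads an OpenFST flag through its 1.8-era name; on
    // older OpenFST the compat header #defines it back to FLAGS_*.
    #include <string>
    #include <fst/fstlib.h>
    #include "fstext/openfst_compat.h"

    static std::string WeightSeparator() {
      return FST_FLAGS_fst_weight_separator;
    }
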
diff --git a/src/fstext/openfst_compat.h b/src/fstext/openfst_compat.h
new file mode 100644
index 00000000000..251d3f893c5
--- /dev/null
+++ b/src/fstext/openfst_compat.h
@@ -0,0 +1,42 @@
+#ifndef KALDI_FSTEXT_OPENFST_COMPAT_H
+#define KALDI_FSTEXT_OPENFST_COMPAT_H
+
+
+#if OPENFST_VER < 10800
+#define FST_FLAGS_fst_weight_separator FLAGS_fst_weight_separator
+#define FST_FLAGS_fst_field_separator FLAGS_fst_field_separator
+#define FST_FLAGS_v FLAGS_v
+
+#endif
+
+namespace fst {
+#if OPENFST_VER >= 10800
+
+
+template <typename... Args>
+auto Map(Args&&... args) -> decltype(ArcMap(std::forward<Args>(args)...)) {
+  return ArcMap(std::forward<Args>(args)...);
+}
+
+using MapFstOptions=ArcMapFstOptions;
+
+template <class A, class B, class C>
+using MapFst = ArcMapFst<A, B, C>;
+
+template <typename Stream, typename Printer>
+void printer_print(Stream &os, Printer &printer, const std::string &s) {
+  printer.Print(os, s);
+}
+
+#else
+
+template <typename Stream, typename Printer>
+void printer_print(Stream &os, Printer &printer, const std::string &s) {
+  printer.Print(&os, s);
+}
+
+#endif
+
+}  // namespace fst
+
+#endif //KALDI_FSTEXT_OPENFST_COMPAT_H
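Taken together, the shims above keep call sites version-agnostic: Map() forwards to ArcMap() on OpenFST >= 1.8 (where Map() was removed), and printer_print() hides the by-pointer vs. by-reference change in FstPrinter::Print(). An illustrative round trip through both shims (PrintInvertedWeights is a hypothetical helper, not part of this patch):

    // Sketch only: inverts arc weights via the Map() shim, then prints
    // the result via the printer_print() shim.
    #include <iostream>
    #include <fst/fstlib.h>
    #include <fst/script/print-impl.h>
    #include "fstext/openfst_compat.h"

    void PrintInvertedWeights(const fst::StdVectorFst &ifst) {
      fst::StdVectorFst ofst;
      fst::Map(ifst, &ofst, fst::InvertWeightMapper<fst::StdArc>());
      fst::FstPrinter<fst::StdArc> printer(ofst, NULL, NULL, NULL,
                                           false, true, "\t");
      fst::printer_print(std::cout, printer, "standard output");
    }
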
diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h
index 7c1b544da4c..45e1a82279a 100644
--- a/src/fstext/pre-determinize-inl.h
+++ b/src/fstext/pre-determinize-inl.h
@@ -235,8 +235,13 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix,
   assert(symTable != NULL);
   const char *prefix_ptr = prefix.c_str();
   size_t prefix_len = strlen(prefix_ptr);  // allowed to be zero but not encouraged.
+#if OPENFST_VER >= 10800
   for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) {
     const std::string &sym = siter->Symbol();
+#else
+  for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) {
+    const std::string &sym = siter.Symbol();
+#endif
     if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) {  // has prefix.
       if (isdigit(sym[prefix_len])) {  // we don't allow prefix followed by a digit, as a symbol.
         // Has at least one digit.
diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc
index 7210e455413..60953e40b8d 100644
--- a/src/fstext/pre-determinize-test.cc
+++ b/src/fstext/pre-determinize-test.cc
@@ -22,8 +22,7 @@
 #include "fstext/fst-test-utils.h"
 #include "fstext/fstext-utils.h"
 
-// Just check that it compiles, for now.
-
+#include "fstext/openfst_compat.h"
 
 namespace fst {
 using std::vector;
@@ -73,7 +72,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing before trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
   // Trim resulting FST.
   Connect(fst);
@@ -81,7 +80,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   VectorFst<Arc> *fst_copy_orig = new VectorFst<Arc>(*fst);
@@ -95,7 +94,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after predeterminization\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -111,7 +110,7 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after epsilon removal\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -121,14 +120,14 @@ template<class Arc> void TestPreDeterminize() {
   std::cout <<" printing after determinization\n";
   {
     FstPrinter<Arc> fstprinter(ofst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   int64 num_removed = DeleteISymbols(&ofst, extra_syms);
   std::cout <<" printing after removing "<<num_removed<<" instances of extra symbols\n";
   {
     FstPrinter<Arc> fstprinter(ofst, sptr, sptr, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   std::cout <<" Checking equivalent to original FST.\n";
@@ -180,7 +179,7 @@ template<class Arc> void TestAddSelfLoops() {
   std::cout <<" printing before adding self-loops\n";
   {
     FstPrinter<Arc> fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
@@ -199,7 +198,7 @@ template<class Arc> void TestAddSelfLoops() {
   std::cout <<" printing after adding self-loops\n";
   {
     FstPrinter<Arc> fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
   }
 
   delete fst;
diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc
index 5d8c40b6a75..f91001fca0d 100644
--- a/src/fstext/prune-special-test.cc
+++ b/src/fstext/prune-special-test.cc
@@ -22,6 +22,8 @@
 #include "fstext/rand-fst.h"
 #include "fstext/fstext-utils.h"
 
+#include "fstext/openfst_compat.h"
+
 namespace fst {
 
 static void TestPruneSpecial() {
@@ -38,7 +40,7 @@ static void TestPruneSpecial() {
 
   {
     FstPrinter<StdArc> fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
@@ -47,7 +49,7 @@ static void TestPruneSpecial() {
   PruneSpecial(*ifst, &ofst1, beam);
   {
     FstPrinter<StdArc> fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
@@ -56,7 +58,7 @@ static void TestPruneSpecial() {
   Prune(*ifst, &ofst2, beam);
   {
     FstPrinter<StdArc> fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t");
-    fstprinter.Print(&std::cout, "standard output");
+    printer_print(std::cout, fstprinter, "standard output");
     std::cout << std::endl;
   }
 
diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc
index 557b43d3062..9fe8ba63b59 100644
--- a/src/fstext/push-special-test.cc
+++ b/src/fstext/push-special-test.cc
@@ -23,6 +23,8 @@
 #include "fstext/fstext-utils.h"
#include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -38,7 +40,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy(*fst); @@ -56,7 +58,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 80cca875ff0..1548ac5c726 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -83,7 +84,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy1(fst); @@ -96,7 +97,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -141,7 +142,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst = \n"; FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst; @@ -156,7 +157,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { // make sure we preserved stochasticity in cases where doing so was diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 2d39fe957dd..1cc8bd02bef 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst{ @@ -64,13 +66,13 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Baseline-Composed FST\n"; { FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if ( !RandEquivalent(composed, composed_baseline, 3/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 20/*path length-- max?*/)) { @@ -79,7 +81,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, 
fstprinter, "standard output"); } @@ -88,7 +90,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -149,7 +151,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -158,7 +160,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -219,7 +221,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -228,7 +230,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 290a4f8bc2e..9e921920c48 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -22,7 +22,7 @@ #include #include - +#include "base/kaldi-types.h" namespace fst { diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index b4682443d29..556d194a60d 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -22,7 +22,8 @@ #include "fstext/determinize-star.h" #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" -// Just check that it compiles, for now. + +#include "fstext/openfst_compat.h" namespace fst { @@ -73,7 +74,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -81,7 +82,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(&std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } vector