diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index c1f923cf58a..6b902ef54ad 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -16,9 +16,11 @@ jobs: CC: "ccache gcc" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install sox run: sudo apt-get install -y sox intel-mkl + - name: Install python2 + run: sudo apt-get install -y python2 - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: @@ -36,3 +38,9 @@ jobs: run: cd src && make -j 3 - name: make test run: cd src && make test + - name: upload logs if failure + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: fail-logs + path: ${{ github.workspace }}/src/**/*testlog diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml index d0bb01c5bf6..f63b761b5e2 100644 --- a/.github/workflows/docker-images.yml +++ b/.github/workflows/docker-images.yml @@ -3,6 +3,7 @@ name: Docker Image CI on: schedule: - cron: '37 2 * * 1' + workflow_dispatch: inputs: logLevel: @@ -14,17 +15,20 @@ on: - info - warning - debug + # pull_request: #for debugging purposes # branches: [ "master" ] jobs: + enable_build: - if: github.repository == 'kaldi-asr/kaldi' + #if: github.repository_owner == 'jtrmal' || github.repository_owner == 'kaldi-asr' + if: github.repository_owner == 'kaldi-asr' runs-on: ubuntu-latest outputs: enabled: ${{ steps.set-enabled.outputs.enabled }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set enabled @@ -38,22 +42,34 @@ jobs: enabled=true fi echo "enabled: $enabled" - echo "::set-output name=enabled::${enabled}" + echo "enabled=${enabled}" >> $GITHUB_OUTPUT - docker-buildx-gpu: + docker-buildx-gpu-12: needs: enable_build if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Maximize build space + uses: AdityaGarg8/remove-unwanted-software@v4.1 + with: + remove-android: 'true' + remove-dotnet: 'true' + remove-haskell: 'true' + remove-codeql: 'true' + remove-docker-images: 'true' + remove-large-packages: 'true' + remove-cached-tools: 'true' + remove-swapfile: 'false' + verbose: 'true' + - uses: actions/checkout@v4 - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: install: true - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -66,22 +82,34 @@ jobs: echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - name: Build and push run: | - cd docker/ubuntu18.04-cuda10.0/ - docker build --push --tag kaldiasr/kaldi:gpu-latest --tag kaldiasr/kaldi:gpu-ubuntu18.04-cuda10.0 --tag kaldiasr/kaldi:gpu-ubuntu18.04-cuda10.0-$(date +%F) . + cd docker/ubuntu22.04-cuda12 + docker build --push --tag kaldiasr/kaldi:gpu-latest --tag kaldiasr/kaldi:gpu-ubuntu22.04-cuda12 --tag kaldiasr/kaldi:gpu-ubuntu22.04-cuda12-$(date +%F) . 
- docker-buildx-cpu: + docker-buildx-gpu-cuda11: needs: enable_build if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Maximize build space + uses: AdityaGarg8/remove-unwanted-software@v4.1 + with: + remove-android: 'true' + remove-dotnet: 'true' + remove-haskell: 'true' + remove-codeql: 'true' + remove-docker-images: 'true' + remove-large-packages: 'true' + remove-cached-tools: 'true' + remove-swapfile: 'false' + verbose: 'true' + - uses: actions/checkout@v4 - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: install: true - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -94,10 +122,71 @@ jobs: echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - name: Build and push run: | - cd docker/debian10-cpu/ + cd docker/ubuntu20.04-cuda11 + docker build --push --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11 --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11-$(date +%F) . + + docker-buildx-cpu-openblas: + needs: enable_build + if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Inspect builder + run: | + echo "Name: ${{ steps.buildx.outputs.name }}" + echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" + echo "Status: ${{ steps.buildx.outputs.status }}" + echo "Flags: ${{ steps.buildx.outputs.flags }}" + echo "Platforms: ${{ steps.buildx.outputs.platforms }}" + - name: Build and push + run: | + cd docker/debian12-cpu/ docker build --push \ --tag kaldiasr/kaldi:latest \ --tag kaldiasr/kaldi:cpu-latest \ - --tag kaldiasr/kaldi:cpu-debian10 \ - --tag kaldiasr/kaldi:cpu-debian10-$(date +%F) . + --tag kaldiasr/kaldi:cpu-latest-openblas \ + --tag kaldiasr/kaldi:cpu-debian12-openblas \ + --tag kaldiasr/kaldi:cpu-debian12-openblas-$(date +%F) . + docker-buildx-cpu-mkl: + needs: enable_build + if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Inspect builder + run: | + echo "Name: ${{ steps.buildx.outputs.name }}" + echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" + echo "Status: ${{ steps.buildx.outputs.status }}" + echo "Flags: ${{ steps.buildx.outputs.flags }}" + echo "Platforms: ${{ steps.buildx.outputs.platforms }}" + - name: Build and push + run: | + cd docker/debian12-cpu-mkl/ + docker build --push \ + --tag kaldiasr/kaldi:cpu-latest-mkl \ + --tag kaldiasr/kaldi:cpu-debian12-mkl \ + --tag kaldiasr/kaldi:cpu-debian12-mkl-$(date +%F) . 
+ + diff --git a/COPYING b/COPYING index 5a5cab00a29..2b0dbd4243a 100644 --- a/COPYING +++ b/COPYING @@ -57,72 +57,72 @@ License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - Mohit Agarwal - Tanel Alumae - Gilles Boulianne - Lukas Burget - Dogan Can - Guoguo Chen - Gaofeng Cheng + Albert Vernon + Alexander Solovets + Allen Guo + Ariya Rastrow + Arnab Ghoshal Cisco Corporation - Pavel Denisov - Ilya Edrenkin - Ewald Enzinger - Joachim Fainberg Daniel Galvez - Pegah Ghahremani - Arnab Ghoshal - Ondrej Glembek + Daniel Povey + Danijel Korzinek + David Snyder + Dogan Can + Eduardo Silva + Ewald Enzinger + Gaofeng Cheng + Gaurav Kumar + Georg Stemmer + Gilles Boulianne Go Vivace Inc. - Allen Guo - Hossein Hadian - Lv Hang - Mirko Hannemann + Guoguo Chen + Haihua Xu + Hainan Xu Hendy Irawan - Navdeep Jaitly + Hossein Hadian + Ilya Edrenkin + Jan "Yenda" Trmal + Jan Silovsky + Joachim Fainberg Johns Hopkins University - Shiyin Kang - Kirill Katsnelson - Tom Ko - Danijel Korzinek - Gaurav Kumar + Karel Vesely Ke Li + Kirill Katsnelson + Lucas Ondel + Lukas Burget + Lv Hang Matthew Maciejewski - Vimal Manohar - Yajie Miao Microsoft Corporation + Minhua Wu + Mirko Hannemann + Mohit Agarwal + Navdeep Jaitly + Nickolay V. Shmyrev + Omid Sadjadi + Ondrej Glembek + Ondrej Platek + Pavel Denisov + Pawel Swietojanski + Pegah Ghahremani + Peter Smit Petr Motlicek - Xingyu Na - Vincent Nguyen - Lucas Ondel - Vassil Panayotov - Vijayaditya Peddinti + Petr Schwarz Phonexia s.r.o. - Ondrej Platek - Daniel Povey - Yanmin Qian - Ariya Rastrow Saarland University - Omid Sadjadi - Petr Schwarz - Yiwen Shao - Nickolay V. Shmyrev - Jan Silovsky - Eduardo Silva - Peter Smit - David Snyder - Alexander Solovets - Georg Stemmer - Pawel Swietojanski - Jan "Yenda" Trmal - Albert Vernon - Karel Vesely - Yiming Wang Shinji Watanabe - Minhua Wu - Haihua Xu - Hainan Xu + Shiyin Kang + Tanel Alumae + Tom Ko + Vassil Panayotov + Vijayaditya Peddinti + Vimal Manohar + Vincent Nguyen Xiaohui Zhang + Xingyu Na + Yajie Miao + Yanmin Qian + Yiming Wang + Yiwen Shao Other Source Material diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py index 5925c6369a8..c8fee4c415f 100644 --- a/cmake/gen_cmake_skeleton.py +++ b/cmake/gen_cmake_skeleton.py @@ -269,7 +269,7 @@ def gen_code(self): if len(self.depends) > 0: ret.append("target_link_libraries(" + self.target_name + " PUBLIC") - for d in self.depends: + for d in self.depends + ['-lcblas', '-llapack']: ret.append(" " + d) ret.append(")\n") diff --git a/docker/debian10-cpu/Dockerfile b/docker/debian12-cpu-mkl/Dockerfile similarity index 52% rename from docker/debian10-cpu/Dockerfile rename to docker/debian12-cpu-mkl/Dockerfile index 05079922d03..aae82d24b93 100644 --- a/docker/debian10-cpu/Dockerfile +++ b/docker/debian12-cpu-mkl/Dockerfile @@ -1,9 +1,10 @@ -FROM debian:10 -LABEL maintainer="rick@scriptix.io" +FROM debian:12 +LABEL maintainer="jtrmal@apptek.com" RUN apt-get update && \ apt-get install -y --no-install-recommends \ g++ \ + gfortran \ make \ automake \ autoconf \ @@ -13,29 +14,21 @@ RUN apt-get update && \ sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ ca-certificates \ - gfortran \ patch \ - ffmpeg \ - vim && \ + python-is-python3 && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python3 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL RUN cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ + ./extras/install_mkl.sh && \ + 
make -j 5 && \ cd /opt/kaldi/src && \ ./configure --shared && \ make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git + make -j 5 + WORKDIR /opt/kaldi/ diff --git a/docker/debian12-cpu/Dockerfile b/docker/debian12-cpu/Dockerfile new file mode 100644 index 00000000000..6c286d6ba24 --- /dev/null +++ b/docker/debian12-cpu/Dockerfile @@ -0,0 +1,34 @@ +FROM debian:12 +LABEL maintainer="jtrmal@apptek.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + gfortran \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + python3 \ + zlib1g-dev \ + ca-certificates \ + patch \ + python-is-python3 && \ + rm -rf /var/lib/apt/lists/* + + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL +RUN cd /opt/kaldi/tools && \ + ./extras/install_openblas.sh && \ + make -j 5 && \ + cd /opt/kaldi/src && \ + ./configure --shared --mathlib=OPENBLAS && \ + make depend -j $(nproc) && \ + make -j 5 + +WORKDIR /opt/kaldi/ diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile deleted file mode 100644 index ba694d1fb96..00000000000 --- a/docker/debian9.8-cpu/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ - -FROM debian:9.8 -LABEL maintainer="mdoulaty@gmail.com" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - g++ \ - make \ - automake \ - autoconf \ - bzip2 \ - unzip \ - wget \ - sox \ - libtool \ - git \ - subversion \ - python2.7 \ - python3 \ - zlib1g-dev \ - ca-certificates \ - gfortran \ - patch \ - ffmpeg \ - vim && \ - rm -rf /var/lib/apt/lists/* - -RUN ln -s /usr/bin/python2.7 /usr/bin/python - -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ - cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ - cd /opt/kaldi/src && \ - ./configure --shared && \ - make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git -WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 b/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 deleted file mode 100644 index 41fc78beb83..00000000000 --- a/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 +++ /dev/null @@ -1,44 +0,0 @@ - -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 -LABEL maintainer="mdoulaty@gmail.com" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - g++ \ - make \ - automake \ - autoconf \ - bzip2 \ - unzip \ - wget \ - sox \ - libtool \ - git \ - subversion \ - python2.7 \ - python3 \ - zlib1g-dev \ - gfortran \ - ca-certificates \ - patch \ - ffmpeg \ - vim && \ - rm -rf /var/lib/apt/lists/* - -RUN ln -s /usr/bin/python2.7 /usr/bin/python - -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ - cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ - cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ - make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; 
&& \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git - -WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu20.04-cuda11/Dockerfile similarity index 57% rename from docker/ubuntu16.04-gpu/Dockerfile rename to docker/ubuntu20.04-cuda11/Dockerfile index 41fc78beb83..81126cd96ac 100644 --- a/docker/ubuntu16.04-gpu/Dockerfile +++ b/docker/ubuntu20.04-cuda11/Dockerfile @@ -1,44 +1,40 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="jtrmal@apptek.com" -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 -LABEL maintainer="mdoulaty@gmail.com" - +ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y --no-install-recommends \ + apt-get install -yqq --no-install-recommends \ + build-essential \ g++ \ make \ automake \ - autoconf \ bzip2 \ unzip \ wget \ - sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ - gfortran \ ca-certificates \ + gfortran \ patch \ - ffmpeg \ - vim && \ + sox \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ + intel-mkl && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ + ./configure --shared --use-cuda=yes && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu18.04-cuda10.0/Dockerfile b/docker/ubuntu22.04-cuda12/Dockerfile similarity index 61% rename from docker/ubuntu18.04-cuda10.0/Dockerfile rename to docker/ubuntu22.04-cuda12/Dockerfile index 0c75863fedd..cb12b6abdd0 100644 --- a/docker/ubuntu18.04-cuda10.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12/Dockerfile @@ -1,44 +1,39 @@ - -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 -LABEL maintainer="mdoulaty@gmail.com" +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 +LABEL maintainer="jtrmal@apptek.com" RUN apt-get update && \ apt-get install -y --no-install-recommends \ + build-essential \ g++ \ make \ automake \ - autoconf \ bzip2 \ unzip \ wget \ - sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ - gfortran \ ca-certificates \ + gfortran \ patch \ - ffmpeg \ - vim && \ + sox \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ + intel-mkl && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ + ./configure --shared --use-cuda=yes && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; 
&& \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ - diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 0d40d25c23a..ed91a980791 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 4389c6b5d81..0cc76a56dd0 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -10,7 +10,7 @@ mic=mdm$nmics # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh index 17e2071f1f6..a212a8846b2 100755 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5b/cmd.sh b/egs/ami/s5b/cmd.sh index b004c5569df..a8ea5d7c1ba 100644 --- a/egs/ami/s5b/cmd.sh +++ b/egs/ami/s5b/cmd.sh @@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G" # the use of cuda_cmd is deprecated, used only in 'nnet1', export cuda_cmd="queue.pl --gpu 1 --mem 20G" -if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then +if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" diff --git a/egs/ami/s5b/conf/ami_beamformit.cfg b/egs/ami/s5b/conf/ami_beamformit.cfg new file mode 100644 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/ami/s5b/conf/ami_beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line 
+do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index 79989f17004..94cd81f230b 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -28,7 +28,7 @@ set -euo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5c/run.sh b/egs/ami/s5c/run.sh index cc4cd87610b..1281cad2e43 100755 --- a/egs/ami/s5c/run.sh +++ b/egs/ami/s5c/run.sh @@ -3,7 +3,7 @@ # Apache 2.0. # # This recipe performs diarization for the mix-headset data in the -# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 +# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 # corpus with simulated RIRs. We use oracle SAD in this recipe. # This recipe demonstrates the following: # 1. Diarization using x-vector and clustering (AHC, VBx, spectral) @@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx) # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac @@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then local/ami_download.sh $mic $AMI_DIR fi -# Prepare data directories. +# Prepare data directories. if [ $stage -le 2 ]; then # Download the data split and references from BUT's AMI setup if ! 
[ -d AMI-diarization-setup ]; then @@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\ ivector-normalize-length ark:- ark:- |" \ $model_dir/xvectors_plda_train/plda || exit 1; - + cp $model_dir/xvectors_plda_train/plda $model_dir/ cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ diff --git a/egs/gop_speechocean762/s5/local/visualize_feats.py b/egs/gop_speechocean762/s5/local/visualize_feats.py index 3b3ddaa037a..202c6a57b6b 100644 --- a/egs/gop_speechocean762/s5/local/visualize_feats.py +++ b/egs/gop_speechocean762/s5/local/visualize_feats.py @@ -8,6 +8,7 @@ import random import kaldi_io import seaborn as sns +import numpy as np from collections import Counter from sklearn.manifold import TSNE from utils import load_human_scores, load_phone_symbol_table @@ -62,6 +63,9 @@ def main(): min(args.samples, len(lables))) features, lables = list(zip(*sampled_paris)) + # Convert the tuple of arrays to a single 2D array + features = np.vstack(features) + # Draw scatters label_counter = Counter(lables) colors = sns.color_palette("colorblind", len(label_counter)) diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index ed4710d0b1f..051715f2b1e 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -54,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh index aa0163235a6..2a5750c9238 100755 --- a/egs/wsj/s5/utils/subword/prepare_subword_text.sh +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -36,7 +36,7 @@ grep -q $separator $word_text && echo "$0: Error, word text file contains separa glossaries_opt= [ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" cut -d ' ' -f2- $word_text | \ - utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub if [ $word_text == $subword_text ]; then mv $word_text ${word_text}.old cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text diff --git a/egs/xbmu_amdo31/README.txt b/egs/xbmu_amdo31/README.txt new file mode 100644 index 00000000000..d2cda16fa58 --- /dev/null +++ b/egs/xbmu_amdo31/README.txt @@ -0,0 +1,11 @@ +About the XBMU-AMDO31 corpus XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University. + +XBMU-AMDO31 dataset is a speech recognition corpus of Tibetan Amdo dialect. The open source corpus contains 31 hours of speech data and resources related to build speech recognition systems,including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused for the Amdo dialect because of the uniformity of the Tibetan language) The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR). 
+ +The database can be downloaded from openslr: +http://www.openslr.org/133/ + +For more details, please visit: +https://huggingface.co/datasets/syzym/xbmu_amdo31 + +This recipe includes some different ASR models trained with XBMU-AMDO31. \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/RESULTS b/egs/xbmu_amdo31/s5/RESULTS new file mode 100644 index 00000000000..e50e43dc4db --- /dev/null +++ b/egs/xbmu_amdo31/s5/RESULTS @@ -0,0 +1,8 @@ +%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0 +%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0 +%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0 +%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0 +%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0 +%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0 +%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0 +%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0 diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh new file mode 100644 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/xbmu_amdo31/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/xbmu_amdo31/s5/conf/decode.config b/egs/xbmu_amdo31/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/xbmu_amdo31/s5/conf/mfcc.conf b/egs/xbmu_amdo31/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/xbmu_amdo31/s5/conf/online_pitch.conf b/egs/xbmu_amdo31/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/xbmu_amdo31/s5/conf/pitch.conf b/egs/xbmu_amdo31/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..826aa163f2a --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=1 +num_jobs_final=2 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $*" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 5 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +exit; diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..52d56adbc60 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $*" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/xbmu_amdo31/s5/local/download_and_untar.sh b/egs/xbmu_amdo31/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..9c70836bf46 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/download_and_untar.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_aishell, resource_aishell." 
+fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_aishell resource_aishell" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="15582913665 1246920" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +if [ $part == "data_aishell" ]; then + cd $data/$part/wav + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..610774fb2a2 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev test" +gmm=tri5a +online=false +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires$online_affix + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/xbmu_amdo-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=$(cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l) + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..3f920315b77 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=$(cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l) + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+  for decode_set in dev test; do
+    # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+    num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l)
+    decode_dir=${dir}_online/decode_${decode_set}_per_utt
+    steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
+      --config conf/decode.config --per-utt true \
+      $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1;
+  done
+fi
+
+wait;
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/score.sh b/egs/xbmu_amdo31/s5/local/score.sh
new file mode 100755
index 00000000000..d283ceb68dc
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/score.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/xbmu_amdo31/s5/local/wer_hyp_filter b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<STDIN>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/wer_output_filter b/egs/xbmu_amdo31/s5/local/wer_output_filter
new file mode 100755
index 00000000000..aceeeec41b4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (<>) {
+  @F = split " ";
+  print $F[0] . " ";
+  foreach $s (@F[1..$#F]) {
+    if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) {
+      print "";
+    } else {
+      print "$s"
+    }
+    print " ";
+  }
+  print "\n";
+}
+
+
diff --git a/egs/xbmu_amdo31/s5/local/wer_ref_filter b/egs/xbmu_amdo31/s5/local/wer_ref_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_ref_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<STDIN>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
new file mode 100755
index 00000000000..a3ba6fabaf4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+#           2021 Northwest Minzu University (Senyan Li)
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 "
+  echo " $0 /export/data/xbmu_amdo31/data/wav /export/data/xbmu_amdo31/data/transcript"
+  exit 1;
+fi
+
+tibetan_audio_dir=$1
+tibetan_text=$2/transcript_clean.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $tibetan_audio_dir ] || [ ! -f $tibetan_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+echo $tibetan_audio_dir
+# find wav audio file for train, dev and test resp.
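The grep calls that follow assume the corpus encodes its split in the wav paths themselves (wav/train, wav/dev, wav/test). Once $tmp_dir/wav.flist has been written by the find command below, that assumption can be checked with a small sketch like:

for split in train dev test; do
  # count how many wav paths fall into each split (illustrative only, not part of the recipe)
  echo "$split: $(grep -c -i "wav/$split" $tmp_dir/wav.flist) wav files"
done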
+find $tibetan_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=$(wc -l < "$tmp_dir/wav.flist")
+[ $n -ne 22630 ] && \
+  echo Warning: expected 22630 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+# Transcriptions preparation
+# cat $tibetan_text |head -10
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all
+  rm -f $dir/transcripts1.txt
+  while read -r line
+  do
+    line1=$(echo "$line" | cut -d '-' -f 2)
+    line2=$(grep -w $line1 $tibetan_text |cut -d " " -f 2-)
+    text=$line" "$line2
+    echo $text >>$dir/transcripts1.txt
+  done < "$dir/utt.list"
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: tibetan data preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
new file mode 100755
index 00000000000..1e5537858ff
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 " && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: Tibetan dict preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
new file mode 100755
index 00000000000..658f0e7bc15
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments.  It assumes you have already run
+# xbmu_amdo31_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=$(command -v train_lm.sh)
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of it as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# From here are some commands to do a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+                   # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
+# Difference in WSJ must have been due to different treatment of the unknown word.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh
new file mode 100755
index 00000000000..b70ffbfbb26
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=$(pwd)/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/xbmu_amdo31/s5/run.sh b/egs/xbmu_amdo31/s5/run.sh
new file mode 100755
index 00000000000..61b3e8f62d8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/run.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+# Copyright 2021 Northwest Minzu University (Authors: Senyan Li)
+#           2017 Hui Bu
+#           2017 Jiayu Du
+#           2017 Xingyu Na
+#           2017 Bengu Wu
+#           2017 Hao Zheng
+# Apache 2.0
+
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
+# corpus directory and download URL
+data=/home1/lsy/kaldi/egs/xbmu_amdo31/s5/export/data
+data_url=www.openslr.org/resources/133
+
+. ./cmd.sh
+
+#local/download_and_untar.sh $data $data_url xbmu-amdo31 || exit 1;
+
+# Lexicon Preparation,
+local/xbmu_amdo31_prepare_dict.sh $data/xbmu_amdo31/resource || exit 1;
+
+# Data Preparation,
+local/xbmu_amdo31_data_prep.sh $data/xbmu_amdo31/data/wav $data/xbmu_amdo31/data/transcript || exit 1;
+
+# Phone Sets, questions, L compilation
+utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
+  "" data/local/lang data/lang || exit 1;
+
+# LM training
+local/xbmu_amdo31_train_lms.sh || exit 1;
+
+# G compilation, check LG composition
+utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
+  data/local/dict/lexicon.txt data/lang_test || exit 1;
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+for x in train dev test; do
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  utils/fix_data_dir.sh data/$x || exit 1;
+done
+
+# Train a monophone model on delta features.
+steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono || exit 1;
+
+# Decode with the monophone model.
+utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/dev exp/mono/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/test exp/mono/decode_test
+
+# Get alignments from monophone system.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+# Train the first triphone pass model tri1 on delta + delta-delta features.
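As a reminder of the interface understood here (and shared by the later train_lda_mllt.sh and train_sat.sh calls), the two leading numbers passed to these GMM training scripts are the number of decision-tree leaves and the total number of Gaussians, followed by the data, lang, alignment and output directories; schematically:

# steps/train_deltas.sh [--cmd ...] <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>
# e.g. this recipe uses 2500 leaves / 20000 Gaussians for tri1 through tri4a, and 3500 / 100000 for tri5a.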
+steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri1/graph data/dev exp/tri1/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri1/graph data/test exp/tri1/decode_test + +# align tri1 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2 [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + +# decode tri2 +utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri2/graph data/dev exp/tri2/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri2/graph data/test exp/tri2/decode_test + +# Align training data with the tri2 model. +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train the second triphone pass model tri3a on LDA+MLLT features. +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + +# Run a test decode with the tri3a model. +utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev +steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + +# align tri3a with fMLLR + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + +# Train the third triphone pass model tri4a on LDA+MLLT+SAT features. +# From now on, we start building a more serious system with Speaker +# Adaptive Training (SAT). +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + +# decode tri4a +utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + +# align tri4a with fMLLR +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri4a exp/tri4a_ali + +# Train tri5a, which is LDA+MLLT+SAT +# Building a larger SAT system. 
You can see the num-leaves is 3500 and tot-gauss is 100000 + +steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + +# decode tri5a +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + +# align tri5a with fMLLR +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + +# nnet3 +local/nnet3/run_tdnn.sh + +# chain +local/chain/run_tdnn.sh + +# getting results (see RESULTS file) +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null +for x in exp/*/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + +exit 0; diff --git a/egs/xbmu_amdo31/s5/steps b/egs/xbmu_amdo31/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/xbmu_amdo31/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/utils b/egs/xbmu_amdo31/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/xbmu_amdo31/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..5036d12b707 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) @@ -56,14 +62,16 @@ endif # Don't call rm -rf. rmlibdir: +ifeq ($(KALDI_FLAVOR), dynamic) ifneq ($(KALDILIBDIR), ) - -rm -f $(KALDILIBDIR)/*{.so,.a,.o} + -rm -f $(KALDILIBDIR)/*{.so,.a,.o,.dylib} -rmdir 2>/dev/null $(KALDILIBDIR); true else # KALDILIBDIR might have been unset because of reconfigure. Do a best guess. @echo "Something seems wrong. Please re-run configure." @echo "I will continue but the cleanup might not be complete." endif +endif kaldi.mk: @echo "ERROR: kaldi.mk does not exist; run ./configure first."; diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 31440edf3f9..68ef224b5f5 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -76,7 +76,7 @@ int main() { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. exit(1); - } catch (kaldi::KaldiFatalError &e) { + } catch (const kaldi::KaldiFatalError &e) { std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index a9904a752cd..572cbb4effd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -185,12 +185,12 @@ class MessageLogger { #define KALDI_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_ASSERT(cond) (void)0 +#define KALDI_ASSERT(cond) (void)(cond) #endif // Some more expensive asserts only checked if this defined. 
@@ -198,12 +198,12 @@ class MessageLogger { #define KALDI_PARANOID_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_PARANOID_ASSERT(cond) (void)0 +#define KALDI_PARANOID_ASSERT(cond) (void)(cond) #endif /***** THIRD-PARTY LOG-HANDLER *****/ diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 7ebf4f85386..6d96ecf2b75 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,9 +39,21 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file +#if OPENFST_VER >= 10800 +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef float float32; +typedef double double64; +#else #include +#endif namespace kaldi { using ::int16; @@ -53,23 +65,4 @@ namespace kaldi { typedef float float32; typedef double double64; } // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 3c93dfd0d39..6aee0c5ce78 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -49,7 +49,7 @@ int32 TypeOneUsage(const ParseOptions &po, } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -78,7 +78,6 @@ int32 TypeOneUsage(const ParseOptions &po, << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " @@ -124,7 +123,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -151,7 +150,6 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 22d4d92055d..535c18365ed 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -172,7 +172,8 @@ int main(int argc, char *argv[]) { if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cerr, "standard error"); + printer_print(std::cerr, fstprinter, "standard error"); + //fstprinter.Print(&std::cerr, "standard error"); KALDI_LOG << "phone 
sequence is: "; for (size_t i = 0; i < phones.size(); i++) std::cerr << phones[i] << ' '; diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 3e622cafdc7..d03bf671245 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -52,7 +52,7 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 n_utts = 0, n_total_vectors = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !vector_reader1.Done(); vector_reader1.Next()) { std::string key = vector_reader1.Key(); @@ -75,7 +75,6 @@ int32 TypeOneUsage(const ParseOptions &po) { << "system " << (i + 2) << ", rspecifier: " << vector_in_fns[i] << " vs " << vector_out.Dim() << " primary vector, rspecifier:" << vector_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No vector found for utterance " << key << " for " diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,14 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 57fe15923b0..b29000a448c 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -571,9 +571,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst( - fst).Write(os, - write_options); + fst::StdCompactAcceptorFst cfst(fst); + cfst.Write(os, write_options); } } else { KALDI_ASSERT(e2e_fsts.size() == num_sequences); @@ -586,9 +585,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. 
fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst( - e2e_fsts[i]).Write(os, - write_options); + fst::StdCompactAcceptorFst cfst(e2e_fsts[i]); + cfst.Write(os, write_options); } } WriteToken(os, binary, ""); diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60a2645b31b 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -347,7 +347,7 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_read = 0, num_written = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -361,7 +361,6 @@ int main(int argc, char *argv[]) { BaseFloat weight = 1.0; if (!egs_weight_reader.HasKey(key)) { KALDI_WARN << "No weight for example key " << key; - num_err++; continue; } weight = egs_weight_reader.Value(key); @@ -371,7 +370,6 @@ int main(int argc, char *argv[]) { if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; - num_err++; continue; } std::string new_output_name = output_name_reader.Value(key); diff --git a/src/configure b/src/configure index ed627eceedc..3743c31f76b 100755 --- a/src/configure +++ b/src/configure @@ -39,7 +39,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=14 +CONFIGURE_VERSION=15 # We support bash version 3.2 (Macs still ship with this version as of 2019) # and above. @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,71 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then + ROCMDIR=$base + break + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc and hipconfig in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true >> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; + fi + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." + exit 1; + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -283,6 +351,7 @@ Either your CUDA is too new or too old." GCC_VER=$($CXX -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in + # Update this list by consulting https://gist.github.com/ax3l/9489132 # Disabling CUDA 7 and CUDA 8 because we now use C++14 to compile CUDA # code. It is still possible to use those cuda versions by switching # back to C++11 in src/makefiles/cuda_64bit.mk and use CUB <= 1.8.0. @@ -317,16 +386,23 @@ Either your CUDA is too new or too old." 11_*) MIN_UNSUPPORTED_GCC_VER="12.0" MIN_UNSUPPORTED_GCC_VER_NUM=120000 - ;; + CUSOLVER=true + ;; + 12_*) + MIN_UNSUPPORTED_GCC_VER="12.3" + MIN_UNSUPPORTED_GCC_VER_NUM=123000 + CUSOLVER=true + ;; *) failure "Unsupported CUDA version ${CUDA_VERSION}. Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ output of either 'nvcc -h' or 'ptxas -h'." ;; esac - (( GCC_VER_NUM < MIN_UNSUPPORTED_GCC_VER_NUM )) || + if [ $GCC_VER_NUM -ge $MIN_UNSUPPORTED_GCC_VER_NUM ]; then failure "CUDA $CUDA_VERSION does not support $CXX (g++-$GCC_VER).\ Only versions strictly older than $MIN_UNSUPPORTED_GCC_VER are supported." 
+ fi case $CUDA_VERSION in [1-8]_* | 9_0) CUSOLVER=false ;; @@ -345,6 +421,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; 11_0) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80" ;; 11_*) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" ;; + 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; *) failure \ "Unsupported CUDA version ${CUDA_VERSION}. Please open an" \ "issue at https://github.com/kaldi-asr/kaldi/issues and" \ @@ -371,6 +448,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +680,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +730,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +811,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -935,6 +1024,14 @@ OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d" if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. 
You need OpenFst >= 1.6.0.)" fi + +if [ $OPENFST_VER_NUM -lt 10800 ]; then + echo "CXXLANGVERSION = c++14" +else + echo "CXXLANGVERSION = c++17" +fi >> kaldi.mk + +echo "OPENFSTVER = $OPENFST_VER_NUM" >> kaldi.mk echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" @@ -968,7 +1065,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk @@ -1057,6 +1158,8 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" + elif [ "$(uname -m)" == "arm64" ]; then + cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk fi @@ -1304,6 +1407,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,14 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,13 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..bec20cb9e07 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,13 @@ #include 
"cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index 89e93e5d98c..32d7ac40e12 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,13 @@ #include -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..4b30c568e73 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,13 @@ #include -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..e20a7dea15c 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,10 +15,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include + +#include "float.h" +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" +#ifndef FLT_MAX +#define FLT_MAX 340282346638528859811704183484516925440.0f +#endif + namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..15f29d27122 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,15 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include -#include +#include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void **)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,13 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..6b0d34f81b7 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,15 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include 
+#include + +#include "hipify.h" +#else #include -#include +#include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/lattice-postprocessor.cc b/src/cudadecoder/lattice-postprocessor.cc index 46d44216890..49f96191787 100644 --- a/src/cudadecoder/lattice-postprocessor.cc +++ b/src/cudadecoder/lattice-postprocessor.cc @@ -78,13 +78,14 @@ bool LatticePostprocessor::GetPostprocessedLattice( KALDI_ASSERT(decoder_frame_shift_ != 0.0 && "SetDecoderFrameShift() must be called (typically by pipeline)"); - if (!word_info_) - KALDI_ERR << "You must set --word-boundary-rxfilename in the lattice " - "postprocessor config"; - // ok &= - // Ignoring the return false for now (but will print a warning), - // because the doc says we can, and it can happen when using endpointing - WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + if (word_info_) { + // ok &= + // Ignoring the return false for now (but will print a warning), + // because the doc says we can, and it can happen when using endpointing + WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + } else { + *out_clat = clat; + } return ok; } diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..a47ea2e2300 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. 
#endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#include "roctracer/roctx.h" +#else #include #include -#include +#include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..06aac47b5e0 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#include "roctracer/roctx.h" +#else #include #include -#include +#include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..b7a9d463214 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,17 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include -#include +#include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,14 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
// +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { @@ -24,6 +30,7 @@ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { retval.y = a.y - b.y; return retval; } + __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x + b.x; @@ -31,11 +38,6 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { return retval; } -__device__ inline void atomicAdd(float2 *addr, float2 val) { - atomicAdd(reinterpret_cast(addr), val.x); - atomicAdd(reinterpret_cast(addr) + 1, val.y); -} - __device__ inline void operator+=(float2 &a, float2 &b) { // overloading += a.x += b.x; diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..5b94c34e829 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,13 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { @@ -45,7 +51,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -96,8 +102,11 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -210,8 +219,11 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -306,10 +318,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + 
KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, feat_dim, ldf, @@ -318,9 +330,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -502,8 +516,9 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -579,7 +594,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -624,7 +639,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. 
+#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..bc06ea32d69 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,16 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include + +#include + +#include "hipify.h" +#else #include -#include +#include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" @@ -62,7 +70,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -481,7 +489,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..d18f5237e8f 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,14 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include + +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. 
+#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +42,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) @@ -179,8 +190,9 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..7b514010562 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,16 @@ #include "cudafeat/feature-spectral-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include + +#include "hipify.h" +#else +#include #include +#endif #include "cudamatrix/cu-rand.h" @@ -128,7 +136,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -487,7 +495,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,14 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..e001eb0790f 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,13 @@ #include "cudafeat/feature-window-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..e03fda01ca7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,13 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif namespace kaldi { @@ -95,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc 
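The CMVN change above derives the block size from feat_dim and clamps it to GPU_MAX_THREADS_PER_BLOCK instead of hard-coding 32 and 1024. As a rough sketch of the launch-geometry arithmetic only (these helper names are invented, and the usual idiom rounds up to a multiple of the warp width before capping at the device limit):

// Hypothetical helpers illustrating block/grid sizing; not part of the patch.
static inline int RoundUpToWarp(int n) {
  // Smallest multiple of the warp width that is >= n.
  return (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
}

static inline int ClampBlockSize(int threads) {
  // A block may not exceed the per-block thread limit.
  return threads > GPU_MAX_THREADS_PER_BLOCK ? GPU_MAX_THREADS_PER_BLOCK
                                             : threads;
}

// A 2D grid that tiles a num_rows x num_cols matrix with
// GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK blocks (ceil-division per axis),
// matching the dim3 blocks(...) computations used throughout the patch.
static inline dim3 TiledGrid(int num_rows, int num_cols) {
  dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
  return dim3((num_cols + threads.x - 1) / threads.x,
              (num_rows + threads.y - 1) / threads.y);
}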
- CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,22 +15,32 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -41,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -133,9 +145,11 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -201,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); 
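batched_gemv_reduce_kernel above sizes its cub::WarpReduce scratch and its padded shared-memory tile with the GPU_* constants instead of literal 32s. A stripped-down sketch of the same CUB warp-reduction pattern, assuming <cub/cub.cuh> (or its hipCUB equivalent) is on the include path; the kernel name and indexing here are made up for the example:

#include <cub/cub.cuh>

// Toy kernel: each warp reduces GPU_WARP_SIZE consecutive floats to one sum.
// Launch with blockDim = (GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK).
__global__ void warp_sum_kernel(const float *in, float *out, int n) {
  typedef cub::WarpReduce<float> WarpReduce;
  // One TempStorage slot per warp in the block.
  __shared__ typename WarpReduce::TempStorage
      temp_storage[GPU_MAX_WARPS_PER_BLOCK];

  int warp = threadIdx.y;  // warp id within the block
  int idx = (blockIdx.x * blockDim.y + warp) * GPU_WARP_SIZE + threadIdx.x;
  float val = (idx < n) ? in[idx] : 0.0f;

  // Lane 0 of each warp receives the warp-wide sum.
  float sum = WarpReduce(temp_storage[warp]).Sum(val);
  if (threadIdx.x == 0) out[blockIdx.x * blockDim.y + warp] = sum;
}

The +1 column padding on the shared tile in the real kernel serves a separate purpose, spreading transposed accesses across shared-memory banks so they do not conflict.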
CU_SAFE_CALL(cudaGetLastError()); } @@ -209,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -232,7 +250,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -243,7 +261,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..daf1c7dfbf9 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,20 @@ // limitations under the License. #if HAVE_CUDA == 1 -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else +#include #endif +#endif + #include #include "base/io-funcs.h" diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. 
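Both ivector sources remap CUBLAS_FILL_MODE_LOWER onto its hipSOLVER counterpart with an #undef/#define pair, so call sites written against the cuSOLVER enum compile unchanged under HIP. In isolation the pattern looks like the sketch below (names shown only to illustrate the remap; the surrounding call sites are not reproduced here):

// Illustrative remap: make an existing CUDA-flavoured name resolve to the
// HIP equivalent without touching the call sites.
#ifdef __IS_HIP_COMPILE__
#ifdef CUBLAS_FILL_MODE_LOWER
#undef CUBLAS_FILL_MODE_LOWER
#endif
#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER
#endif
// ...solver calls that pass CUBLAS_FILL_MODE_LOWER now need no edits.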
#if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..e3f2ed75d30 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include -#include +#include +#endif #endif #include diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..d54ba56be84 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,7 +16,9 @@ // limitations under the License. #if HAVE_CUDA == 1 -#include +#ifndef __IS_HIP_COMPILE__ +#include +#endif #endif #include "base/kaldi-common.h" #include "util/common-utils.h" diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,15 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,18 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-array.cc 
b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,14 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 84f78f00a91..3db44bf4aa5 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -105,13 +105,12 @@ class CuArrayBase { protected: /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuArrayBase(): data_(NULL), dim_(0) { } + CuArrayBase(): data_(NULL), dim_(0) { } T *data_; ///< GPU data pointer (if GPU not available, ///< will point to CPU memory). MatrixIndexT dim_; ///< dimension of the vector - }; /** @@ -123,22 +122,21 @@ class CuArrayBase { template class CuArray: public CuArrayBase { public: - /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. - CuArray() { } + CuArray() { } /// Constructor with memory initialisation. resize_type may be kSetZero or /// kUndefined. - explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) + explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) { Resize(dim, resize_type); } /// Constructor from CPU-based int vector - explicit CuArray(const std::vector &src) { CopyFromVec(src); } + explicit CuArray(const std::vector &src) { CopyFromVec(src); } /// Copy constructor. We don't make this explicit because we want to be able /// to create a std::vector. - CuArray(const CuArray &src) { CopyFromArray(src); } + CuArray(const CuArray &src) { CopyFromArray(src); } /// Destructor ~CuArray() { Destroy(); } @@ -172,7 +170,6 @@ class CuArray: public CuArrayBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; - }; @@ -182,7 +179,7 @@ class CuSubArray: public CuArrayBase { /// Constructor as a range of an existing CuArray or CuSubArray. Note: like /// similar constructors in class CuVector and others, it can be used to evade /// 'const' constraints; don't do that. 
- explicit CuSubArray(const CuArrayBase &src, + explicit CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim); /// Construct from raw pointers diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,15 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#define API_NAME_PREFIX "HIP" +#else #include +#define API_NAME_PREFIX "CU" +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" @@ -31,6 +39,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -43,9 +54,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { - nvtxRangePop(); +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else + nvtxRangePop(); +#endif } #endif @@ -87,61 +103,106 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case 
HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case
CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..f7f45b8043a 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,25 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #include -#include +#include + +#define GPU_WARP_SIZE 32 +#define GPU_MAX_THREADS_PER_BLOCK 1024 +#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE) +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,17 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -240,8 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode +#ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; 
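The status-string helpers above build their messages as API_NAME_PREFIX "BLAS_...", relying on C and C++ adjacent-string-literal concatenation so one switch body reports "CUBLAS_..." under CUDA and "HIPBLAS_..." under HIP. A small self-contained illustration of that preprocessor technique (the enum and function here are invented for the example and are not part of the patch):

#include <cstdio>

#ifdef __IS_HIP_COMPILE__
#define API_NAME_PREFIX "HIP"
#else
#define API_NAME_PREFIX "CU"
#endif

enum class FakeStatus { kSuccess, kAllocFailed };

const char *FakeStatusString(FakeStatus s) {
  switch (s) {
    // Adjacent string literals are concatenated at compile time, so this
    // expands to "CUBLAS_STATUS_SUCCESS" or "HIPBLAS_STATUS_SUCCESS".
    case FakeStatus::kSuccess:     return API_NAME_PREFIX "BLAS_STATUS_SUCCESS";
    case FakeStatus::kAllocFailed: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED";
  }
  return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR";
}

int main() { std::printf("%s\n", FakeStatusString(FakeStatus::kSuccess)); }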
+#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,27 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #include #include - +#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..b3c3165bd96 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,11 +27,23 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include +#include + +#include +#include + +#include "cudamatrix/cu-kernels-ansi.h" +#include "hipify.h" +#else #include +#include "cudamatrix/cu-common.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -953,11 +965,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, } // Warp reduce. Implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } // output 1 sum per thread block @@ -1109,8 +1122,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1167,6 +1180,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1206,11 +1220,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, } // Warp reduce to 1 element. Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } // output 1 sum per thread block @@ -1257,12 +1272,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } - } // output TileDim sums per thread block if (tid < TileDim) { @@ -1340,13 +1356,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. 
- if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } - // output TileDim sums per thread block if (tid < TileDim && j_n < dim_N.cols) { v[j_n] = alpha * smem.sum[tid] + beta * v[j_n]; @@ -1793,10 +1809,11 @@ static void _vec_transform_reduce( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); } + __syncwarp(); } // Output to vector result. @@ -2006,9 +2023,11 @@ static void _transform_reduce_mat_rows( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } // Output to vector result. @@ -2045,11 +2064,13 @@ static void _transform_reduce_mat_cols( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } - + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2087,13 +2108,12 @@ static void _group_transform_reduce( x_idx += threads_per_group; } sreduction[tid] = treduction; - if (threads_per_group > warpSize) { - __syncthreads(); - } + __syncthreads(); // tree-reduce to 2x warpSize elements per group -# pragma unroll - for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) { + int shift = threads_per_group / 2; +#pragma unroll + for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } @@ -2101,14 +2121,12 @@ static void _group_transform_reduce( } // Warp-reduce to 1 element per group. - // Threads implicitly synchronized within the warp. - const int warp_reduce_size = - threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize; - if (threadIdx.x < warp_reduce_size) { # pragma unroll - for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { + for (; shift > 0; shift >>= 1) { + if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } + __syncwarp(); } // Store the result. 
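Several kernels above are reorganized so the `if (tid < warpSize)` guard sits inside the reduction loop and each step ends with __syncwarp(); the reduction then no longer depends on implicit warp-synchronous execution, which differs once the wavefront is 64 lanes wide. A minimal sketch of the restructured final stage (hypothetical device helper; it assumes a prior __syncthreads() has left 2*warpSize partial sums at the front of sdata):

// Hypothetical final stage of a block reduction, not a kernel from the patch.
__device__ void final_warp_reduce(float *sdata, int tid) {
  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < warpSize) {
      sdata[tid] += sdata[tid + shift];
    }
    __syncwarp();  // explicit intra-warp sync after every step
  }
  // sdata[0] now holds the block-wide sum (read it from tid == 0).
}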
@@ -2967,12 +2985,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, } // reduce to 1 element per row - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; } + __syncwarp(); } // broadcast the sum results @@ -3254,15 +3273,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, } // Warp reduce without __syncthreads() // (note.: synchronizes implicitly within a warp at the multiprocessor) - if (tid < warpSize / 2) { #pragma unroll - for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; - num_working_threads >>= 1) { + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (tid < warpSize / 2) { if (smax[tid + num_working_threads] > smax[tid]) { smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } } + __syncwarp(); } if (tid == 0) { @@ -3990,9 +4010,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4371,7 +4391,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4392,6 +4412,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#endif } } @@ -4402,7 +4427,11 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#endif } } @@ -5077,7 +5106,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5098,6 +5127,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, 
stride_v); +#endif } } @@ -5108,7 +5142,11 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#endif } } @@ -5479,25 +5517,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5793,7 +5831,15 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all threads without blocking. void cuda_legacy_noop() { +#ifdef __IS_HIP_COMPILE__ + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. 
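The cudaF/cudaD_add_diag_mat_mat_MTN and _MN wrappers above gain a Bl.x == 64 branch under HIP so the templated kernel is also instantiated for a 64-lane wavefront, alongside the existing 16- and 32-wide tiles. A condensed sketch of that runtime-to-compile-time dispatch (kernel name, body, and arguments are invented for the example):

// Illustrative only: select a compile-time tile width from the runtime
// block shape, adding the 64-wide instantiation for HIP wavefronts.
template <int TileDim>
__global__ void tile_kernel(const float *in, float *out, int n) {
  // Trivial body, just to make the instantiation concrete.
  for (int i = threadIdx.x; i < n; i += TileDim) out[i] = in[i];
}

void launch_tile_kernel(dim3 Gr, dim3 Bl, const float *in, float *out, int n) {
  if (Bl.x == 16) {
    tile_kernel<16><<<Gr, Bl>>>(in, out, n);
  } else if (Bl.x == 32) {
    tile_kernel<32><<<Gr, Bl>>>(in, out, n);
#ifdef __IS_HIP_COMPILE__
  } else if (Bl.x == 64) {
    tile_kernel<64><<<Gr, Bl>>>(in, out, n);
#endif
  }
}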
+ auto k = reinterpret_cast(_noop_kernel); + hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); +#else _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); +#endif } void cudaF_mat_copy_range_clamped( @@ -5803,8 +5849,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(32,32); - dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5817,8 +5865,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(32,32); - dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5827,8 +5877,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(32,32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5854,9 +5903,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5876,10 +5926,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5892,8 +5943,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(32,32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5919,9 +5969,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5942,10 +5993,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 3fbeff3a470..d0d8e4e771f 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), // n_blocks(num_rows, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index be8483e48f5..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2705,7 +2705,7 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -243,7 +250,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, } else { // 2D thread block with warps (blockDim.x) along the row-dim of input M. 
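The batched-copy wrappers above fill a fixed-size descriptor on the host and hand it to the kernel by value, so the per-matrix metadata travels in the kernel's parameter space (constant memory) with no explicit device memcpy. A toy version of that pattern (struct, kernel, and the TOY_MAX_BATCH constant are invented for the sketch):

#define TOY_MAX_BATCH 16

struct ToyCopyDesc {
  const float *src[TOY_MAX_BATCH];
  float *dst[TOY_MAX_BATCH];
  int num_elems[TOY_MAX_BATCH];
};

// The descriptor is a kernel argument, so it is copied into the launch's
// parameter buffer as part of the launch itself.  Launch with gridDim.y equal
// to the number of matrices in the batch.
__global__ void toy_batched_copy(ToyCopyDesc desc) {
  int b = blockIdx.y;  // one y-slice of the grid per matrix
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < desc.num_elems[b];
       i += gridDim.x * blockDim.x) {
    desc.dst[b][i] = desc.src[b][i];
  }
}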
// Each (8x32) thread block will transpose (32x32) data - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(M.NumCols(), warpSize), n_blocks(M.NumRows(), warpSize)); @@ -849,7 +856,7 @@ void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), n_blocks(NumRows(), dimBlock.y)); @@ -999,7 +1006,7 @@ void CuMatrixBase::AddSmat(Real alpha, const CuSparseMatrix &A, // We use warpSize threads per row to access only the nonzero elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of A. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y)); @@ -2176,7 +2183,7 @@ Real TraceMatMat(const CuMatrixBase &A, // if the matrix is not in a very bad shape. // (wider or taller than 32x8192) // CPU will then reduce to 1 element. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), n_blocks(A.NumRows(), kWarpSize)); @@ -2398,7 +2405,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { // and use transposed copy to fill *this // see CuMatrixBase::CopyFromMat() for more detail of the impl MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ }; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(rv_dim.cols, warpSize), n_blocks(rv_dim.rows, warpSize)); @@ -2408,7 +2415,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } else if (rv.Dim() == num_rows_) { // use 2D block (8x32) and large enough grid to cover matrix *this // dimBlock.x need to be at least warpSize for coalesced memory access. - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), n_blocks(num_rows_, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index a531ecd45b9..775fecd82c6 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -231,7 +231,7 @@ class CuMatrixBase { bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; /// Get size of matrix in bytes - MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } + size_t SizeInBytes() const { return static_cast(num_rows_)*static_cast(stride_)*sizeof(Real); } // Copy functions. These do not resize. template @@ -250,7 +250,7 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + // This function will copy from source rows (start_range, end_range] // if the range is outside of the clamped region then the clamped // row will be replicated across the out of range areas @@ -307,9 +307,9 @@ class CuMatrixBase { void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); void Floor(const CuMatrixBase &src, Real floor_val); - + void Ceiling(const CuMatrixBase &src, Real ceiling_val); - + /// This is equivalent to running: /// Floor(src, lower_limit); /// Ceiling(src, upper_limit); @@ -320,7 +320,7 @@ class CuMatrixBase { /// (x < 0 ? exp(x) : x + 1). 
This function is used /// in our RNNLM training. void ExpSpecial(const CuMatrixBase &src); - + /// Softmax nonlinearity /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, /// with attention to avoiding overflow or underflow. @@ -333,7 +333,7 @@ class CuMatrixBase { /// Supports in-place operation (i.e. this == &src). void LogSoftMaxPerRow(const CuMatrixBase &src); - + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -439,23 +439,23 @@ class CuMatrixBase { this -> Pow(*this, power); }; - + inline void ApplyPowAbs(Real power, bool include_sign=false) { this -> PowAbs(*this, power, include_sign); }; - + inline void ApplyHeaviside() { this -> Heaviside(*this); }; - + inline void ApplyFloor(Real floor_val) { this -> Floor(*this, floor_val); }; - + inline void ApplyCeiling(Real ceiling_val) { this -> Ceiling(*this, ceiling_val); }; - + inline void ApplyExp() { this -> Exp(*this); }; @@ -670,13 +670,13 @@ class CuMatrixBase { inline const CuSubVector Row(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return CuSubVector(data_ + (i * stride_), NumCols()); + return CuSubVector(data_ + (static_cast(i) * static_cast(stride_)), NumCols()); } inline CuSubVector Row(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return CuSubVector(data_ + (i * stride_), NumCols()); + return CuSubVector(data_ + (static_cast(i) * static_cast(stride_)), NumCols()); } inline CuValue operator() (MatrixIndexT r, MatrixIndexT c) { @@ -684,7 +684,7 @@ class CuMatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return CuValue(data_ + r * stride_ + c); + return CuValue(data_ + static_cast(r) * static_cast(stride_) + c); } inline Real operator() (MatrixIndexT r, MatrixIndexT c) const { @@ -692,7 +692,7 @@ class CuMatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return CuValue(data_ + r * stride_ + c); // will be casted to Real. + return CuValue(data_ + static_cast(r) * static_cast(stride_) + c); // will be casted to Real. } Real Sum() const; @@ -737,10 +737,10 @@ class CuMatrixBase { /// Get raw row pointer (const). Warning: may return a pointer to GPU memory. Use at /// your own risk. - inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; } + inline const Real* RowData(MatrixIndexT r) const { return data_ + static_cast(r) * static_cast(stride_); } /// Get raw row pointer. Warning: may return a pointer to GPU memory. Use at /// your own risk. - inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; } + inline Real* RowData(MatrixIndexT r) { return data_ + static_cast(r) * static_cast(stride_); } /// Return data pointer (const). Warning: may return a pointer to GPU memory. /// Use at your own risk. inline const Real *Data() const { return data_; } @@ -924,7 +924,7 @@ class CuSubMatrix: public CuMatrixBase { /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. 
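The accessor changes above widen the row index and stride to size_t before multiplying, because `r * stride_` in 32-bit MatrixIndexT arithmetic can overflow once a matrix spans more than 2^31 elements. A two-line host illustration of the hazard and the fix (values invented; assumes a 64-bit size_t):

#include <cstddef>
#include <cstdint>

int main() {
  int32_t num_rows = 70000, stride = 70000;  // ~4.9e9 elements
  // A 32-bit product here would overflow and give a bogus byte count:
  //   int32_t bad = num_rows * stride;
  // Widen before multiplying, as the patch does:
  size_t bytes = static_cast<size_t>(num_rows) * static_cast<size_t>(stride) *
                 sizeof(float);
  return bytes > 0 ? 0 : 1;
}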
- inline CuSubMatrix (const CuSubMatrix &other): + inline CuSubMatrix(const CuSubMatrix &other): CuMatrixBase (other.data_, other.num_rows_, other.num_cols_, other.stride_) {} private: diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,16 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index aad34b5dd54..0c2230a8731 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() { template static void UnitTestCuSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 2; i++) { - MatrixIndexT row = 10 + Rand() % 40; - MatrixIndexT col = 10 + Rand() % 50; + MatrixIndexT row = 2 + Rand() % 3; + MatrixIndexT col = 1 + Rand() % 4; CuMatrix mat1(row, col); CuMatrix mat2(col, row); @@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() { cu_smat2.CopyToMat(&mat2); Real trace1 = TraceMatMat(mat3, mat1, kTrans); + Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans); AssertEqual(trace1, trace2, 0.00001); trace1 = TraceMatMat(mat3, mat2, kNoTrans); trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans); + AssertEqual(trace1, trace2, 0.00001); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include #include @@ -138,7 +145,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. 
- const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); @@ -161,7 +168,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -194,8 +201,8 @@ template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( - NULL) { + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), + csr_val_(NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); @@ -266,8 +273,9 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); + csr_col_idx_ = NULL; // may be freed, but this is allowed. csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); @@ -277,10 +285,16 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( - (num_rows + 1 + nnz) * sizeof(int))); - csr_val_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int))); + if (nnz > 0) { + csr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + nnz * sizeof(int))); + csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); + } else { + csr_col_idx_ = NULL; + csr_val_ = NULL; + } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { @@ -302,8 +316,11 @@ void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - if (csr_row_ptr_col_idx_) { - CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); + if (csr_row_ptr_) { + CuDevice::Instantiate().Free(csr_row_ptr_); + } + if (csr_col_idx_) { + CuDevice::Instantiate().Free(csr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); @@ -311,7 +328,8 @@ void CuSparseMatrix::Destroy() { num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = NULL; + csr_row_ptr_ = NULL; + csr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -378,11 +396,17 @@ void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); - CuSubArray idx_to(csr_row_ptr_col_idx_, - NumRows() + 1 + NumElements()); - CuSubArray idx_from(smat.csr_row_ptr_col_idx_, - smat.NumRows() + 1 + smat.NumElements()); - idx_to.CopyFromArray(idx_from); + { + CuSubArray idx_to(csr_row_ptr_, NumRows() + 1); + CuSubArray idx_from(smat.csr_row_ptr_, NumRows() + 1); + idx_to.CopyFromArray(idx_from); + } + + { + CuSubArray idx_to(csr_col_idx_, NumElements()); + CuSubArray idx_from(smat.csr_col_idx_, NumElements()); + idx_to.CopyFromArray(idx_from); + } } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), 
kUndefined); @@ -413,9 +437,14 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { smat->Resize(0, 0); return; } - CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); - std::vector idx_cpu; - idx.CopyToVec(&idx_cpu); + CuSubArray row_ptr(csr_row_ptr_, NumRows() + 1); + std::vector row_ptr_cpu; + row_ptr.CopyToVec(&row_ptr_cpu); + + + CuSubArray col_idx(csr_col_idx_, NumElements()); + std::vector col_idx_cpu; + col_idx.CopyToVec(&col_idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); @@ -425,8 +454,8 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { - for (; n < idx_cpu[i + 1]; ++n) { - const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; + for (; n < row_ptr_cpu[i + 1]; ++n) { + const MatrixIndexT j = col_idx_cpu[n]; pairs[i].push_back( { j, val_cpu(n) }); } } @@ -484,7 +513,8 @@ void CuSparseMatrix::Swap(CuSparseMatrix *smat) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); - std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); + std::swap(csr_row_ptr_, smat->csr_row_ptr_); + std::swap(csr_col_idx_, smat->csr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif @@ -548,7 +578,7 @@ Real TraceMatSmat(const CuMatrixBase &A, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); @@ -638,7 +668,7 @@ void CuSparseMatrix::CopyToMat(CuMatrixBase *M, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index 82b17a0dc71..180beed6183 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,13 +121,13 @@ class CuSparseMatrix { /// Default constructor CuSparseMatrix() : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { } /// Constructor from CPU-based sparse matrix. explicit CuSparseMatrix(const SparseMatrix &smat) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat); } @@ -135,7 +135,7 @@ class CuSparseMatrix { /// Constructor from GPU-based sparse matrix (supports transposition). CuSparseMatrix(const CuSparseMatrix &smat, MatrixTransposeType trans = kNoTrans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat, trans); } @@ -200,19 +200,19 @@ class CuSparseMatrix { /// indices of the first nonzero element in the i-th row, while the last entry /// contains nnz_, as zero-based CSR format is used. 
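Since the change above splits the old combined `csr_row_ptr_col_idx_` buffer into separate `csr_row_ptr_` and `csr_col_idx_` arrays, a small worked example of the zero-based CSR layout may help; the matrix below is illustrative only and is not taken from the tests:

```cpp
// Illustrative 3x3 matrix with nnz_ == 4 in zero-based CSR form:
//
//   M = [ 0 5 0 ]     csr_row_ptr_ = {0, 1, 3, 4};  // length num_rows_ + 1
//       [ 7 0 2 ]     csr_col_idx_ = {1, 0, 2, 1};  // length nnz_
//       [ 0 3 0 ]     csr_val_     = {5, 7, 2, 3};  // length nnz_
//
// Row i owns the entries in [csr_row_ptr_[i], csr_row_ptr_[i+1]), and the
// final row-pointer entry equals nnz_; this is why the old combined array
// had length num_rows_ + 1 + nnz_, and why CopyToSmat() can now read the
// column index of element n directly as csr_col_idx_[n].
```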
const int* CsrRowPtr() const { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } int* CsrRowPtr() { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } /// Returns pointer to the integer array of length nnz_ that contains /// the column indices of the corresponding elements in array CsrVal() const int* CsrColIdx() const { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } int* CsrColIdx() { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } private: @@ -238,9 +238,10 @@ class CuSparseMatrix { // number of non-zeros MatrixIndexT nnz_; - // csr row ptrs and col indices in a single int array - // of the length (num_rows_ + 1 + nnz_) - int* csr_row_ptr_col_idx_; + // length num_rows_ + 1 + int* csr_row_ptr_; + // length nnz_ + int* csr_col_idx_; // csr value array of the length nnz_ Real* csr_val_; diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,16 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 8de46ec46f5..4219467f615 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -48,18 +48,18 @@ class CuTpMatrix : public CuPackedMatrix { CuTpMatrix() : CuPackedMatrix() {} explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) : CuPackedMatrix(r, resize_type) {} - - explicit CuTpMatrix(const TpMatrix &orig) + + explicit CuTpMatrix(const TpMatrix &orig) : CuPackedMatrix(orig) {} // This constructor lacks the "explicit" keyword so that // we can include this class in std::vector. - CuTpMatrix(const CuTpMatrix &orig) + CuTpMatrix(const CuTpMatrix &orig) : CuPackedMatrix(orig) {} - - explicit CuTpMatrix(const CuMatrixBase &orig, + + explicit CuTpMatrix(const CuMatrixBase &orig, MatrixTransposeType trans = kNoTrans); - + ~CuTpMatrix() {} void CopyFromMat(const CuMatrixBase &M, @@ -70,12 +70,12 @@ class CuTpMatrix : public CuPackedMatrix { } void CopyFromTp(const TpMatrix &other) { CuPackedMatrix::CopyFromPacked(other); - } + } void Cholesky(const CuSpMatrix& Orig); void Invert(); CuTpMatrix &operator = (const CuTpMatrix &in); - + protected: inline const TpMatrix &Mat() const { return *(reinterpret_cast* >(this)); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,16 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -629,7 +636,10 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) - // 16x16 or 8x32 2D block for coalesced memory access. + // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2 + // or + // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE + // 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. 
for small matrices, use 1D grid with only 1 row of 16x16 block, // to avoid multiple kernel launch; @@ -637,11 +647,12 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // use 1- or 2-D grid so that the grid contains // at least and not much larger than 'kOptNumBlocks' blocks // to fully utilize the GPU; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - 16 : 32; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -667,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), @@ -676,8 +688,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index f1c32756887..82e1fb47fcb 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -243,7 +243,7 @@ class CuVectorBase { /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuVectorBase(): data_(NULL), dim_(0) { } + CuVectorBase(): data_(NULL), dim_(0) { } Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). 
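The `GPU_WARP_SIZE` substitutions above generalize block shapes that were hard-coded for NVIDIA's 32-thread warps so the same kernels launch sensibly on AMD's 64-lane wavefronts. A small standalone check, assuming `CU1DBLOCK` is the usual 256 and that `GPU_WARP_SIZE` expands to 32 on CUDA and 64 under HIP (an assumption, not verified here):

```cpp
// Prints the 2-D block shapes implied by the tile_dim logic in
// CuVectorBase::AddDiagMatMat() for both warp sizes.  With warp == 32 this
// reproduces the 16x16 and 8x32 (dimBlock(32, 8)) blocks that the original
// comments describe.
#include <cstdio>

int main() {
  const int CU1DBLOCK = 256;  // assumed value of Kaldi's 1-D block size
  for (int warp : {32, 64}) {
    const int small_tile = warp / 2;  // chosen for small/narrow matrices
    const int large_tile = warp;      // chosen for large matrices
    std::printf("warp=%d: dimBlock(%d,%d) or dimBlock(%d,%d)\n", warp,
                small_tile, CU1DBLOCK / small_tile,
                large_tile, CU1DBLOCK / large_tile);
  }
  return 0;
}
```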
diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,18 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif @@ -219,6 +225,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +250,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +327,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +344,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc index cc8712e854d..87378f93bbd 100644 --- a/src/decoder/lattice-simple-decoder.cc +++ b/src/decoder/lattice-simple-decoder.cc @@ -45,8 +45,8 @@ void 
LatticeSimpleDecoder::InitDecoding() { bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { InitDecoding(); - - while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { + + while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); ProcessEmitting(decodable); @@ -57,7 +57,7 @@ bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { ProcessNonemitting(); } FinalizeDecoding(); - + // Returns true if we have any kind of traceback available (not necessarily // to the end state; query ReachedFinal() for that). return !final_costs_.empty(); @@ -88,9 +88,9 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, if (decoding_finalized_ && !use_final_probs) KALDI_ERR << "You cannot call FinalizeDecoding() and then call " << "GetRawLattice() with use_final_probs == false"; - + unordered_map final_costs_local; - + const unordered_map &final_costs = (decoding_finalized_ ? final_costs_ : final_costs_local); @@ -100,7 +100,7 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, ofst->DeleteStates(); int32 num_frames = NumFramesDecoded(); KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_/2 + 3; + const int32 bucket_count = num_toks_/2 + 3; unordered_map tok_map(bucket_count); // First create all states. for (int32 f = 0; f <= num_frames; f++) { @@ -169,10 +169,10 @@ bool LatticeSimpleDecoder::GetLattice( fst::ILabelCompare ilabel_comp; ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes // lattice-determinization more efficient. - + fst::DeterminizeLatticePrunedOptions lat_opts; lat_opts.max_mem = config_.det_opts.max_mem; - + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. Connect(ofst); // Remove unreachable states... there might be @@ -196,7 +196,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( bool emitting, bool *changed) { KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - + unordered_map::iterator find_iter = cur_toks_.find(state); if (find_iter == cur_toks_.end()) { // no such token presently. // Create one. @@ -221,7 +221,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( return tok; } } - + // delta is the amount by which the extra_costs must // change before it sets "extra_costs_changed" to true. If delta is larger, // we'll tend to go back less far toward the beginning of the file. @@ -242,7 +242,7 @@ void LatticeSimpleDecoder::PruneForwardLinks( warned_ = true; } } - + bool changed = true; while (changed) { changed = false; @@ -300,7 +300,7 @@ void LatticeSimpleDecoder::ComputeFinalCosts( BaseFloat infinity = std::numeric_limits::infinity(); BaseFloat best_cost = infinity, best_cost_with_final = infinity; - + for (unordered_map::const_iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { StateId state = iter->first; @@ -336,19 +336,19 @@ void LatticeSimpleDecoder::ComputeFinalCosts( // on the final frame. If there are final tokens active, it uses the final-probs // for pruning, otherwise it treats all tokens as final. void LatticeSimpleDecoder::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); + KALDI_ASSERT(!active_toks_.empty()); int32 frame_plus_one = active_toks_.size() - 1; if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. 
KALDI_WARN << "No tokens alive at end of file\n"; - typedef unordered_map::const_iterator IterType; + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; // We're about to delete some of the tokens active on the final frame, so we // clear cur_toks_ because otherwise it would then contain dangling pointers. cur_toks_.clear(); - + // Now go through tokens on this frame, pruning forward links... may have to // iterate a few times until there is no more change, because the list is not // in topological order. This is a modified version of the code in @@ -429,7 +429,7 @@ BaseFloat LatticeSimpleDecoder::FinalRelativeCost() const { return final_relative_cost_; } } - + // Prune away any tokens on this frame that have no forward links. [we don't do // this in PruneForwardLinks because it would give us a problem with dangling // pointers]. @@ -453,14 +453,14 @@ void LatticeSimpleDecoder::PruneTokensForFrame(int32 frame) { } } } - + // Go backwards through still-alive tokens, pruning them, starting not from // the current frame (where we want to keep all tokens) but from the frame before // that. We go backwards through the frames and stop when we reach a point // where the delta-costs are not changing (and the delta controls when we consider // a cost to have "not changed"). void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); + int32 cur_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; // The index "f" below represents a "frame plus one", i.e. you'd have to subtract // one to get the corresponding index for the decodable object. @@ -468,7 +468,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // Reason why we need to prune forward links in this situation: // (1) we have never pruned them // (2) we never pruned the forward links on the next frame, which - // + // if (active_toks_[f].must_prune_forward_links) { bool extra_costs_changed = false, links_pruned = false; PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); @@ -478,7 +478,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { active_toks_[f].must_prune_tokens = true; active_toks_[f].must_prune_forward_links = false; } - if (f+1 < cur_frame_plus_one && + if (f+1 < cur_frame_plus_one && active_toks_[f+1].must_prune_tokens) { PruneTokensForFrame(f+1); active_toks_[f+1].must_prune_tokens = false; @@ -493,20 +493,20 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // (optionally) on the final frame. Takes into account the final-prob of // tokens. This function used to be called PruneActiveTokensFinal(). void LatticeSimpleDecoder::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); + int32 final_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { + for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { bool b1, b2; // values not used. 
BaseFloat dontcare = 0.0; PruneForwardLinks(f, &b1, &b2, dontcare); PruneTokensForFrame(f + 1); } - PruneTokensForFrame(0); + PruneTokensForFrame(0); KALDI_VLOG(3) << "pruned tokens from " << num_toks_begin << " to " << num_toks_; } - + void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { int32 frame = active_toks_.size() - 1; // frame is the frame-index // (zero-based) used to get likelihoods @@ -538,9 +538,9 @@ void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { // AddToken adds the next_tok to cur_toks_ (if not already present). Token *next_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, true, NULL); - + // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } @@ -553,7 +553,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() { // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). - + // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. Note-- this queue structure is is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), @@ -569,15 +569,9 @@ void LatticeSimpleDecoder::ProcessNonemitting() { queue.push_back(state); best_cost = std::min(best_cost, iter->second->tot_cost); } - if (queue.empty()) { - if (!warned_) { - KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is " - << frame; - warned_ = true; - } - } + BaseFloat cutoff = best_cost + config_.beam; - + while (!queue.empty()) { StateId state = queue.back(); queue.pop_back(); @@ -600,10 +594,10 @@ void LatticeSimpleDecoder::ProcessNonemitting() { bool changed; Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, false, &changed); - + tok->links = new ForwardLink(new_tok, 0, arc.olabel, graph_cost, 0, tok->links); - + // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. if (changed && fst_.NumInputEpsilons(arc.nextstate) != 0) @@ -662,5 +656,3 @@ void LatticeSimpleDecoder::PruneCurrentTokens(BaseFloat beam, unordered_map fstprinter(*f, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -224,7 +226,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Composed FST is:\n"; { // Try to print the fst. 
FstPrinter fstprinter(fst_composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index 5e4f1812930..ae902021c7d 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -22,6 +22,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; using std::cout; @@ -94,7 +96,7 @@ template void TestDeterminizeLattice() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst det_fst; try { @@ -106,7 +108,7 @@ template void TestDeterminizeLattice() { std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); // OK, now determinize it a different way and check equivalence. @@ -117,7 +119,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact FST is:\n"; { FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (kaldi::Rand() % 2 == 1) ConvertLattice(det_fst, &compact_det_fst, false); @@ -128,7 +130,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); @@ -149,14 +151,14 @@ template void TestDeterminizeLattice2() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; } diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index e9650ca29a6..36c9ba397a6 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -725,7 +725,7 @@ void DeterminizerStar::EpsilonClosure:: { // this sorting is based on StateId - sort(ecinfo_.begin(), ecinfo_.end()); + std::sort(ecinfo_.begin(), ecinfo_.end()); output_subset->clear(); diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index 272774b20aa..36568c5bea4 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -24,6 +24,7 @@ #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -38,7 +39,7 @@ template void TestDeterminizeGeneral() { std::cout 
<< "FST before determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; try { @@ -46,7 +47,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST after determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { @@ -101,7 +102,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -109,7 +110,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -122,7 +123,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -138,7 +139,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -157,14 +158,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -174,7 +175,7 @@ template void TestDeterminize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -242,7 +243,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
Connect(fst); @@ -250,7 +251,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -267,7 +268,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, fst_pushed, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -320,7 +321,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -328,7 +329,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -341,7 +342,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -357,7 +358,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -370,7 +371,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -382,7 +383,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -392,7 +393,8 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -401,7 +403,7 @@ template void TestMinimize() { { std::cout <<" printing after minimization [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); @@ -410,7 +412,7 @@ template void TestMinimize() { { std::cout <<" printing factor-weight FST\n"; FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard 
output"); } Map(fwfst, &ofst_star, FromGallicMapper()); @@ -418,7 +420,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -431,7 +433,7 @@ template void TestMinimize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 9f13b8b9695..d58dbfa539c 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -79,7 +80,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(&fst); @@ -87,7 +88,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (fst.Start() == kNoStateId) return; // "Connect" made it empty. diff --git a/src/fstext/fstext-lib.h b/src/fstext/fstext-lib.h index bdb8ff730e5..03c8e5861dd 100644 --- a/src/fstext/fstext-lib.h +++ b/src/fstext/fstext-lib.h @@ -20,6 +20,9 @@ #ifndef KALDI_FSTEXT_FSTEXT_LIB_H_ #define KALDI_FSTEXT_FSTEXT_LIB_H_ #include "fst/fstlib.h" + +#include "fstext/openfst_compat.h" + #include "fstext/context-fst.h" #include "fstext/determinize-star.h" #include "fstext/factor.h" diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 148c64f3699..fb3a637bc19 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -163,7 +163,7 @@ void RemoveSomeInputSymbols(const std::vector &to_remove, MutableFst *fst) { KALDI_ASSERT_IS_INTEGER_TYPE(I); RemoveSomeInputSymbolsMapper mapper(to_remove); - Map(fst, mapper); + ArcMap(fst, mapper); } template @@ -374,12 +374,25 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); - for (const SymbolTable::iterator::value_type &sym : symtab) { - if (include_eps || sym.Label() != 0) { - syms_out->push_back(sym.Label()); - KALDI_ASSERT(syms_out->back() == sym.Label()); // an integer-range thing. +#if OPENFST_VER >= 10800 + for (SymbolTable::iterator iter = symtab.begin(); + iter != symtab.end(); + ++iter) { + if (include_eps || iter->Label() != 0) { + syms_out->push_back(iter->Label()); + KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. + } + } +#else + for (SymbolTableIterator iter(symtab); + !iter.Done(); + iter.Next()) { + if (include_eps || iter.Value() != 0) { + syms_out->push_back(iter.Value()); + KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. 
} } +#endif } template diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 38ecc35dae2..460e49c7dec 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -23,6 +23,8 @@ #include "util/stl-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -140,7 +142,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -148,7 +150,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -362,7 +364,7 @@ void TestEqualAlign() { template void Print(const Fst &fst, std::string message) { std::cout << message << "\n"; FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h index 5789dbe7cc3..db14ddd3576 100644 --- a/src/fstext/fstext-utils.h +++ b/src/fstext/fstext-utils.h @@ -113,7 +113,7 @@ void PushInLog(VectorFst *fst, uint32 ptype, float delta = kDelta) { template void MinimizeEncoded(VectorFst *fst, float delta = kDelta) { - Map(fst, QuantizeMapper(delta)); + ArcMap(fst, QuantizeMapper(delta)); EncodeMapper encoder(kEncodeLabels | kEncodeWeights, ENCODE); Encode(fst, &encoder); internal::AcceptorMinimize(fst); diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index 01047919c22..3baa5b95c9c 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -24,6 +24,8 @@ #include "util/text-utils.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -44,7 +46,8 @@ void WriteFstKaldi(std::ostream &os, bool binary, bool acceptor = false, write_one = false; FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(os, ""); + //printer.Print(&os, ""); + printer_print(os, printer, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; // Write another newline as a terminating character. The read routine will diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h index a45920936ec..3c34f4b4787 100644 --- a/src/fstext/kaldi-fst-io.h +++ b/src/fstext/kaldi-fst-io.h @@ -26,6 +26,7 @@ #include #include #include "base/kaldi-common.h" +#include "fstext/openfst_compat.h" // Some functions for writing Fsts. 
// I/O for FSTs is a bit of a mess, and not very well integrated with Kaldi's diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index c97a538dd1d..03ac9947c5c 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -268,9 +268,9 @@ void ConvertFstToLattice( MutableFst > > *ofst) { int32 num_states_cache = 50000; fst::CacheOptions cache_opts(true, num_states_cache); - fst::MapFstOptions mapfst_opts(cache_opts); + fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; - MapFst >, + ArcMapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); *ofst = map_fst; } diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc index 8b596fcc356..6f1d2747cc1 100644 --- a/src/fstext/lattice-utils-test.cc +++ b/src/fstext/lattice-utils-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { template void TestConvert(bool invert) { @@ -31,7 +33,7 @@ template void TestConvert(bool invert) { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -39,14 +41,14 @@ template void TestConvert(bool invert) { std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst origfst; ConvertLattice(ofst, &origfst, invert); std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -67,7 +69,7 @@ template void TestShortestPath() { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst cfst; ConvertLattice(*fst, &cfst, false); // invert == false @@ -205,7 +207,7 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -213,14 +215,14 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); /*std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -260,7 +262,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST 
before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -268,7 +270,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after converting and scaling is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); @@ -276,7 +278,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after back conversion and scaling is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ // If RandEquivalent doesn't work, it could be due to a nasty issue related to the use // of exact floating-point comparisons in the Plus function of LatticeWeight. diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index ab33a7f243a..1458b1c9891 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -23,6 +23,7 @@ #include "fst/fstlib.h" #include "base/kaldi-common.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -438,11 +439,9 @@ class CompactLatticeWeightTpl { CompactLatticeWeightTpl(const WeightType &w, const std::vector &s): weight_(w), string_(s) { } - CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) { - weight_ = w.weight_; - string_ = w.string_; - return *this; - } + CompactLatticeWeightTpl(const CompactLatticeWeightTpl &compactLatticeWeightTpl) = default; + + CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) = default; const W &Weight() const { return weight_; } diff --git a/src/fstext/openfst_compat.h b/src/fstext/openfst_compat.h new file mode 100644 index 00000000000..251d3f893c5 --- /dev/null +++ b/src/fstext/openfst_compat.h @@ -0,0 +1,42 @@ +#ifndef KALDI_FSTEXT_OPENFST_COMPAT_H +#define KALDI_FSTEXT_OPENFST_COMPAT_H + + +#if OPENFST_VER < 10800 +#define FST_FLAGS_fst_weight_separator FLAGS_fst_weight_separator +#define FST_FLAGS_fst_field_separator FLAGS_fst_field_separator +#define FST_FLAGS_v FLAGS_v + +#endif + +namespace fst { +#if OPENFST_VER >= 10800 + + +template +auto Map(Args&&... args) -> decltype(ArcMap(std::forward(args)...)) { + return ArcMap(std::forward(args)...); +} + +using MapFstOptions=ArcMapFstOptions; + +template +using MapFst = ArcMapFst; + +template +void printer_print(Stream &os, Printer &printer, const std::string &s) { + printer.Print(os, s); +} + +#else + +template +void printer_print(Stream &os, Printer &printer, const std::string &s) { + printer.Print(&os, s); +} + +#endif + +} // namespace fst + +#endif //KALDI_FSTEXT_OPENFST_COMPAT_H diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index 4ce58c97abc..45e1a82279a 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -235,9 +235,14 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix, assert(symTable != NULL); const char *prefix_ptr = prefix.c_str(); size_t prefix_len = strlen(prefix_ptr); // allowed to be zero but not encouraged. - for (const SymbolTable::iterator::value_type &symbol : *symTable) { - const char *sym = symbol.Symbol().c_str(); - if (!strncmp(prefix_ptr, sym, prefix_len)) { // has prefix. 
+#if OPENFST_VER >= 10800 + for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) { + const std::string &sym = siter->Symbol(); +#else + for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) { + const std::string &sym = siter.Symbol(); +#endif + if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) { // has prefix. if (isdigit(sym[prefix_len])) { // we don't allow prefix followed by a digit, as a symbol. // Has at least one digit. size_t pos; @@ -411,8 +416,6 @@ void PreDeterminize(MutableFst *fst, std::vector d_vec(max_state+1, false); // "done vector". Purely for debugging. - size_t num_extra_det_states = 0; - // (D)(v) while (Q.size() != 0) { @@ -491,7 +494,6 @@ void PreDeterminize(MutableFst *fst, assert(m_map.count(this_pr.first) == 0); m_map[this_pr.first] = k; k++; - num_extra_det_states++; } } else { // Create the set V_t. V_t.insert(this_pr.second); @@ -689,11 +691,9 @@ typename Arc::StateId CreateSuperFinal(MutableFst *fst) { typedef typename Arc::Weight Weight; assert(fst != NULL); StateId num_states = fst->NumStates(); - StateId num_final = 0; std::vector final_states; for (StateId s = 0; s < num_states; s++) { if (fst->Final(s) != Weight::Zero()) { - num_final++; final_states.push_back(s); } } diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc index 95ebd62f04f..60953e40b8d 100644 --- a/src/fstext/pre-determinize-test.cc +++ b/src/fstext/pre-determinize-test.cc @@ -22,8 +22,7 @@ #include "fstext/fst-test-utils.h" #include "fstext/fstext-utils.h" -// Just check that it compiles, for now. - +#include "fstext/openfst_compat.h" namespace fst { using std::vector; @@ -73,7 +72,7 @@ template void TestPreDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
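The `printer_print()` shim from the new `fstext/openfst_compat.h` is what all of these test-file edits switch to. A minimal usage sketch follows, assuming `FstPrinter` still comes from OpenFST's `fst/script/print-impl.h` and that `OPENFST_VER` is supplied by Kaldi's build flags (both assumptions, not taken from this patch):

```cpp
#include <iostream>

#include <fst/fstlib.h>
#include <fst/script/print-impl.h>  // assumed location of fst::FstPrinter

#include "fstext/openfst_compat.h"

// Prints an FST in text form regardless of OpenFST version: the shim calls
// printer.Print(os, ...) on OpenFST >= 1.8 and printer.Print(&os, ...) on
// older releases, so callers no longer need version-specific #ifdefs.
void PrintStdFst(const fst::StdVectorFst &f) {
  fst::FstPrinter<fst::StdArc> fstprinter(f, NULL, NULL, NULL,
                                          false /*acceptor*/,
                                          true /*show weight one*/, "\t");
  fst::printer_print(std::cout, fstprinter, "standard output");
}
```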
Connect(fst); @@ -81,7 +80,7 @@ template void TestPreDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -95,7 +94,7 @@ template void TestPreDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -111,7 +110,7 @@ template void TestPreDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -121,14 +120,14 @@ template void TestPreDeterminize() { std::cout <<" printing after determinization\n"; { FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } int64 num_removed = DeleteISymbols(&ofst, extra_syms); std::cout <<" printing after removing "< fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -180,7 +179,7 @@ template void TestAddSelfLoops() { std::cout <<" printing before adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -199,7 +198,7 @@ template void TestAddSelfLoops() { std::cout <<" printing after adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc index f27b54f4587..f91001fca0d 100644 --- a/src/fstext/prune-special-test.cc +++ b/src/fstext/prune-special-test.cc @@ -22,6 +22,8 @@ #include "fstext/rand-fst.h" #include "fstext/fstext-utils.h" +#include "fstext/openfst_compat.h" + namespace fst { static void TestPruneSpecial() { @@ -38,7 +40,7 @@ static void TestPruneSpecial() { { FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } @@ -47,7 +49,7 @@ static void TestPruneSpecial() { PruneSpecial(*ifst, &ofst1, beam); { FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } @@ -56,7 +58,7 @@ static void TestPruneSpecial() { Prune(*ifst, &ofst2, beam); { FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc index 9cf16bb8a84..9fe8ba63b59 100644 --- a/src/fstext/push-special-test.cc +++ b/src/fstext/push-special-test.cc @@ -23,6 +23,8 @@ #include "fstext/fstext-utils.h" #include 
"base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -38,7 +40,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy(*fst); @@ -56,7 +58,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 2e1d3d8cfa1..1548ac5c726 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -83,7 +84,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy1(fst); @@ -96,7 +97,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -141,7 +142,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst = \n"; FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst; @@ -156,7 +157,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { // make sure we preserved stochasticity in cases where doing so was diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 0e8982720d4..1cc8bd02bef 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst{ @@ -64,13 +66,13 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Baseline-Composed FST\n"; { FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if ( !RandEquivalent(composed, composed_baseline, 3/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 20/*path length-- max?*/)) { @@ -79,7 +81,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard 
output"); } @@ -88,7 +90,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -149,7 +151,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -158,7 +160,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -219,7 +221,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -228,7 +230,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 290a4f8bc2e..9e921920c48 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -22,7 +22,7 @@ #include #include - +#include "base/kaldi-types.h" namespace fst { diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index 3045a669362..556d194a60d 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -22,7 +22,8 @@ #include "fstext/determinize-star.h" #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" -// Just check that it compiles, for now. + +#include "fstext/openfst_compat.h" namespace fst { @@ -73,7 +74,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -81,7 +82,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } vector