diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index c1f923cf58a..6b902ef54ad 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -16,9 +16,11 @@ jobs: CC: "ccache gcc" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install sox run: sudo apt-get install -y sox intel-mkl + - name: Install python2 + run: sudo apt-get install -y python2 - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: @@ -36,3 +38,9 @@ jobs: run: cd src && make -j 3 - name: make test run: cd src && make test + - name: upload logs if failure + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: fail-logs + path: ${{ github.workspace }}/src/**/*testlog diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml index d0bb01c5bf6..f63b761b5e2 100644 --- a/.github/workflows/docker-images.yml +++ b/.github/workflows/docker-images.yml @@ -3,6 +3,7 @@ name: Docker Image CI on: schedule: - cron: '37 2 * * 1' + workflow_dispatch: inputs: logLevel: @@ -14,17 +15,20 @@ on: - info - warning - debug + # pull_request: #for debugging purposes # branches: [ "master" ] jobs: + enable_build: - if: github.repository == 'kaldi-asr/kaldi' + #if: github.repository_owner == 'jtrmal' || github.repository_owner == 'kaldi-asr' + if: github.repository_owner == 'kaldi-asr' runs-on: ubuntu-latest outputs: enabled: ${{ steps.set-enabled.outputs.enabled }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set enabled @@ -38,22 +42,34 @@ jobs: enabled=true fi echo "enabled: $enabled" - echo "::set-output name=enabled::${enabled}" + echo "enabled=${enabled}" >> $GITHUB_OUTPUT - docker-buildx-gpu: + docker-buildx-gpu-12: needs: enable_build if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Maximize build space + uses: AdityaGarg8/remove-unwanted-software@v4.1 + with: + remove-android: 'true' + remove-dotnet: 'true' + remove-haskell: 'true' + remove-codeql: 'true' + remove-docker-images: 'true' + remove-large-packages: 'true' + remove-cached-tools: 'true' + remove-swapfile: 'false' + verbose: 'true' + - uses: actions/checkout@v4 - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: install: true - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -66,22 +82,34 @@ jobs: echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - name: Build and push run: | - cd docker/ubuntu18.04-cuda10.0/ - docker build --push --tag kaldiasr/kaldi:gpu-latest --tag kaldiasr/kaldi:gpu-ubuntu18.04-cuda10.0 --tag kaldiasr/kaldi:gpu-ubuntu18.04-cuda10.0-$(date +%F) . + cd docker/ubuntu22.04-cuda12 + docker build --push --tag kaldiasr/kaldi:gpu-latest --tag kaldiasr/kaldi:gpu-ubuntu22.04-cuda12 --tag kaldiasr/kaldi:gpu-ubuntu22.04-cuda12-$(date +%F) . 
- docker-buildx-cpu: + docker-buildx-gpu-cuda11: needs: enable_build if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - name: Maximize build space + uses: AdityaGarg8/remove-unwanted-software@v4.1 + with: + remove-android: 'true' + remove-dotnet: 'true' + remove-haskell: 'true' + remove-codeql: 'true' + remove-docker-images: 'true' + remove-large-packages: 'true' + remove-cached-tools: 'true' + remove-swapfile: 'false' + verbose: 'true' + - uses: actions/checkout@v4 - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: install: true - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -94,10 +122,71 @@ jobs: echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - name: Build and push run: | - cd docker/debian10-cpu/ + cd docker/ubuntu20.04-cuda11 + docker build --push --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11 --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11-$(date +%F) . + + docker-buildx-cpu-openblas: + needs: enable_build + if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Inspect builder + run: | + echo "Name: ${{ steps.buildx.outputs.name }}" + echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" + echo "Status: ${{ steps.buildx.outputs.status }}" + echo "Flags: ${{ steps.buildx.outputs.flags }}" + echo "Platforms: ${{ steps.buildx.outputs.platforms }}" + - name: Build and push + run: | + cd docker/debian12-cpu/ docker build --push \ --tag kaldiasr/kaldi:latest \ --tag kaldiasr/kaldi:cpu-latest \ - --tag kaldiasr/kaldi:cpu-debian10 \ - --tag kaldiasr/kaldi:cpu-debian10-$(date +%F) . + --tag kaldiasr/kaldi:cpu-latest-openblas \ + --tag kaldiasr/kaldi:cpu-debian12-openblas \ + --tag kaldiasr/kaldi:cpu-debian12-openblas-$(date +%F) . + docker-buildx-cpu-mkl: + needs: enable_build + if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Inspect builder + run: | + echo "Name: ${{ steps.buildx.outputs.name }}" + echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" + echo "Status: ${{ steps.buildx.outputs.status }}" + echo "Flags: ${{ steps.buildx.outputs.flags }}" + echo "Platforms: ${{ steps.buildx.outputs.platforms }}" + - name: Build and push + run: | + cd docker/debian12-cpu-mkl/ + docker build --push \ + --tag kaldiasr/kaldi:cpu-latest-mkl \ + --tag kaldiasr/kaldi:cpu-debian12-mkl \ + --tag kaldiasr/kaldi:cpu-debian12-mkl-$(date +%F) . 
+ + diff --git a/COPYING b/COPYING index 5a5cab00a29..2b0dbd4243a 100644 --- a/COPYING +++ b/COPYING @@ -57,72 +57,72 @@ License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - Mohit Agarwal - Tanel Alumae - Gilles Boulianne - Lukas Burget - Dogan Can - Guoguo Chen - Gaofeng Cheng + Albert Vernon + Alexander Solovets + Allen Guo + Ariya Rastrow + Arnab Ghoshal Cisco Corporation - Pavel Denisov - Ilya Edrenkin - Ewald Enzinger - Joachim Fainberg Daniel Galvez - Pegah Ghahremani - Arnab Ghoshal - Ondrej Glembek + Daniel Povey + Danijel Korzinek + David Snyder + Dogan Can + Eduardo Silva + Ewald Enzinger + Gaofeng Cheng + Gaurav Kumar + Georg Stemmer + Gilles Boulianne Go Vivace Inc. - Allen Guo - Hossein Hadian - Lv Hang - Mirko Hannemann + Guoguo Chen + Haihua Xu + Hainan Xu Hendy Irawan - Navdeep Jaitly + Hossein Hadian + Ilya Edrenkin + Jan "Yenda" Trmal + Jan Silovsky + Joachim Fainberg Johns Hopkins University - Shiyin Kang - Kirill Katsnelson - Tom Ko - Danijel Korzinek - Gaurav Kumar + Karel Vesely Ke Li + Kirill Katsnelson + Lucas Ondel + Lukas Burget + Lv Hang Matthew Maciejewski - Vimal Manohar - Yajie Miao Microsoft Corporation + Minhua Wu + Mirko Hannemann + Mohit Agarwal + Navdeep Jaitly + Nickolay V. Shmyrev + Omid Sadjadi + Ondrej Glembek + Ondrej Platek + Pavel Denisov + Pawel Swietojanski + Pegah Ghahremani + Peter Smit Petr Motlicek - Xingyu Na - Vincent Nguyen - Lucas Ondel - Vassil Panayotov - Vijayaditya Peddinti + Petr Schwarz Phonexia s.r.o. - Ondrej Platek - Daniel Povey - Yanmin Qian - Ariya Rastrow Saarland University - Omid Sadjadi - Petr Schwarz - Yiwen Shao - Nickolay V. Shmyrev - Jan Silovsky - Eduardo Silva - Peter Smit - David Snyder - Alexander Solovets - Georg Stemmer - Pawel Swietojanski - Jan "Yenda" Trmal - Albert Vernon - Karel Vesely - Yiming Wang Shinji Watanabe - Minhua Wu - Haihua Xu - Hainan Xu + Shiyin Kang + Tanel Alumae + Tom Ko + Vassil Panayotov + Vijayaditya Peddinti + Vimal Manohar + Vincent Nguyen Xiaohui Zhang + Xingyu Na + Yajie Miao + Yanmin Qian + Yiming Wang + Yiwen Shao Other Source Material diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py index 5925c6369a8..c8fee4c415f 100644 --- a/cmake/gen_cmake_skeleton.py +++ b/cmake/gen_cmake_skeleton.py @@ -269,7 +269,7 @@ def gen_code(self): if len(self.depends) > 0: ret.append("target_link_libraries(" + self.target_name + " PUBLIC") - for d in self.depends: + for d in self.depends + ['-lcblas', '-llapack']: ret.append(" " + d) ret.append(")\n") diff --git a/docker/debian10-cpu/Dockerfile b/docker/debian12-cpu-mkl/Dockerfile similarity index 52% rename from docker/debian10-cpu/Dockerfile rename to docker/debian12-cpu-mkl/Dockerfile index 05079922d03..aae82d24b93 100644 --- a/docker/debian10-cpu/Dockerfile +++ b/docker/debian12-cpu-mkl/Dockerfile @@ -1,9 +1,10 @@ -FROM debian:10 -LABEL maintainer="rick@scriptix.io" +FROM debian:12 +LABEL maintainer="jtrmal@apptek.com" RUN apt-get update && \ apt-get install -y --no-install-recommends \ g++ \ + gfortran \ make \ automake \ autoconf \ @@ -13,29 +14,21 @@ RUN apt-get update && \ sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ ca-certificates \ - gfortran \ patch \ - ffmpeg \ - vim && \ + python-is-python3 && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python3 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL RUN cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ + ./extras/install_mkl.sh && \ + 
make -j 5 && \ cd /opt/kaldi/src && \ ./configure --shared && \ make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git + make -j 5 + WORKDIR /opt/kaldi/ diff --git a/docker/debian12-cpu/Dockerfile b/docker/debian12-cpu/Dockerfile new file mode 100644 index 00000000000..6c286d6ba24 --- /dev/null +++ b/docker/debian12-cpu/Dockerfile @@ -0,0 +1,34 @@ +FROM debian:12 +LABEL maintainer="jtrmal@apptek.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + gfortran \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + python3 \ + zlib1g-dev \ + ca-certificates \ + patch \ + python-is-python3 && \ + rm -rf /var/lib/apt/lists/* + + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL +RUN cd /opt/kaldi/tools && \ + ./extras/install_openblas.sh && \ + make -j 5 && \ + cd /opt/kaldi/src && \ + ./configure --shared --mathlib=OPENBLAS && \ + make depend -j $(nproc) && \ + make -j 5 + +WORKDIR /opt/kaldi/ diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile deleted file mode 100644 index ba694d1fb96..00000000000 --- a/docker/debian9.8-cpu/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ - -FROM debian:9.8 -LABEL maintainer="mdoulaty@gmail.com" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - g++ \ - make \ - automake \ - autoconf \ - bzip2 \ - unzip \ - wget \ - sox \ - libtool \ - git \ - subversion \ - python2.7 \ - python3 \ - zlib1g-dev \ - ca-certificates \ - gfortran \ - patch \ - ffmpeg \ - vim && \ - rm -rf /var/lib/apt/lists/* - -RUN ln -s /usr/bin/python2.7 /usr/bin/python - -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ - cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ - cd /opt/kaldi/src && \ - ./configure --shared && \ - make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git -WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 b/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 deleted file mode 100644 index 41fc78beb83..00000000000 --- a/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 +++ /dev/null @@ -1,44 +0,0 @@ - -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 -LABEL maintainer="mdoulaty@gmail.com" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - g++ \ - make \ - automake \ - autoconf \ - bzip2 \ - unzip \ - wget \ - sox \ - libtool \ - git \ - subversion \ - python2.7 \ - python3 \ - zlib1g-dev \ - gfortran \ - ca-certificates \ - patch \ - ffmpeg \ - vim && \ - rm -rf /var/lib/apt/lists/* - -RUN ln -s /usr/bin/python2.7 /usr/bin/python - -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ - cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j $(nproc) && \ - cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ - make depend -j $(nproc) && \ - make -j $(nproc) && \ - find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; 
&& \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ - rm -rf /opt/kaldi/.git - -WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu20.04-cuda11/Dockerfile similarity index 57% rename from docker/ubuntu16.04-gpu/Dockerfile rename to docker/ubuntu20.04-cuda11/Dockerfile index 41fc78beb83..81126cd96ac 100644 --- a/docker/ubuntu16.04-gpu/Dockerfile +++ b/docker/ubuntu20.04-cuda11/Dockerfile @@ -1,44 +1,40 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="jtrmal@apptek.com" -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 -LABEL maintainer="mdoulaty@gmail.com" - +ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y --no-install-recommends \ + apt-get install -yqq --no-install-recommends \ + build-essential \ g++ \ make \ automake \ - autoconf \ bzip2 \ unzip \ wget \ - sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ - gfortran \ ca-certificates \ + gfortran \ patch \ - ffmpeg \ - vim && \ + sox \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ + intel-mkl && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ + ./configure --shared --use-cuda=yes && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; && \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ - diff --git a/docker/ubuntu18.04-cuda10.0/Dockerfile b/docker/ubuntu22.04-cuda12/Dockerfile similarity index 61% rename from docker/ubuntu18.04-cuda10.0/Dockerfile rename to docker/ubuntu22.04-cuda12/Dockerfile index 0c75863fedd..cb12b6abdd0 100644 --- a/docker/ubuntu18.04-cuda10.0/Dockerfile +++ b/docker/ubuntu22.04-cuda12/Dockerfile @@ -1,44 +1,39 @@ - -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 -LABEL maintainer="mdoulaty@gmail.com" +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 +LABEL maintainer="jtrmal@apptek.com" RUN apt-get update && \ apt-get install -y --no-install-recommends \ + build-essential \ g++ \ make \ automake \ - autoconf \ bzip2 \ unzip \ wget \ - sox \ libtool \ git \ - subversion \ - python2.7 \ python3 \ zlib1g-dev \ - gfortran \ ca-certificates \ + gfortran \ patch \ - ffmpeg \ - vim && \ + sox \ + software-properties-common && \ + apt-add-repository multiverse && \ + apt-get update && \ + yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ + intel-mkl && \ rm -rf /var/lib/apt/lists/* -RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda && \ + ./configure --shared --use-cuda=yes && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ - find /opt/intel -type f -name "*.a" -exec rm {} \; 
&& \ - find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ - diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 0d40d25c23a..ed91a980791 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 4389c6b5d81..0cc76a56dd0 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -10,7 +10,7 @@ mic=mdm$nmics # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh index 17e2071f1f6..a212a8846b2 100755 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5b/cmd.sh b/egs/ami/s5b/cmd.sh index b004c5569df..a8ea5d7c1ba 100644 --- a/egs/ami/s5b/cmd.sh +++ b/egs/ami/s5b/cmd.sh @@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G" # the use of cuda_cmd is deprecated, used only in 'nnet1', export cuda_cmd="queue.pl --gpu 1 --mem 20G" -if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then +if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" diff --git a/egs/ami/s5b/conf/ami_beamformit.cfg b/egs/ami/s5b/conf/ami_beamformit.cfg new file mode 100644 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/ami/s5b/conf/ami_beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line 
+do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index 79989f17004..94cd81f230b 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -28,7 +28,7 @@ set -euo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5c/run.sh b/egs/ami/s5c/run.sh index cc4cd87610b..1281cad2e43 100755 --- a/egs/ami/s5c/run.sh +++ b/egs/ami/s5c/run.sh @@ -3,7 +3,7 @@ # Apache 2.0. # # This recipe performs diarization for the mix-headset data in the -# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 +# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 # corpus with simulated RIRs. We use oracle SAD in this recipe. # This recipe demonstrates the following: # 1. Diarization using x-vector and clustering (AHC, VBx, spectral) @@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx) # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac @@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then local/ami_download.sh $mic $AMI_DIR fi -# Prepare data directories. +# Prepare data directories. if [ $stage -le 2 ]; then # Download the data split and references from BUT's AMI setup if ! 
[ -d AMI-diarization-setup ]; then @@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\ ivector-normalize-length ark:- ark:- |" \ $model_dir/xvectors_plda_train/plda || exit 1; - + cp $model_dir/xvectors_plda_train/plda $model_dir/ cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ diff --git a/egs/gop_speechocean762/s5/local/visualize_feats.py b/egs/gop_speechocean762/s5/local/visualize_feats.py index 3b3ddaa037a..202c6a57b6b 100644 --- a/egs/gop_speechocean762/s5/local/visualize_feats.py +++ b/egs/gop_speechocean762/s5/local/visualize_feats.py @@ -8,6 +8,7 @@ import random import kaldi_io import seaborn as sns +import numpy as np from collections import Counter from sklearn.manifold import TSNE from utils import load_human_scores, load_phone_symbol_table @@ -62,6 +63,9 @@ def main(): min(args.samples, len(lables))) features, lables = list(zip(*sampled_paris)) + # Convert the tuple of arrays to a single 2D array + features = np.vstack(features) + # Draw scatters label_counter = Counter(lables) colors = sns.color_palette("colorblind", len(label_counter)) diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index ed4710d0b1f..051715f2b1e 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -54,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh index aa0163235a6..2a5750c9238 100755 --- a/egs/wsj/s5/utils/subword/prepare_subword_text.sh +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -36,7 +36,7 @@ grep -q $separator $word_text && echo "$0: Error, word text file contains separa glossaries_opt= [ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" cut -d ' ' -f2- $word_text | \ - utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub if [ $word_text == $subword_text ]; then mv $word_text ${word_text}.old cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text diff --git a/egs/xbmu_amdo31/README.txt b/egs/xbmu_amdo31/README.txt new file mode 100644 index 00000000000..d2cda16fa58 --- /dev/null +++ b/egs/xbmu_amdo31/README.txt @@ -0,0 +1,11 @@ +About the XBMU-AMDO31 corpus XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University. + +XBMU-AMDO31 dataset is a speech recognition corpus of Tibetan Amdo dialect. The open source corpus contains 31 hours of speech data and resources related to build speech recognition systems,including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused for the Amdo dialect because of the uniformity of the Tibetan language) The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR). 
+ +The database can be downloaded from openslr: +http://www.openslr.org/133/ + +For more details, please visit: +https://huggingface.co/datasets/syzym/xbmu_amdo31 + +This recipe includes some different ASR models trained with XBMU-AMDO31. \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/RESULTS b/egs/xbmu_amdo31/s5/RESULTS new file mode 100644 index 00000000000..e50e43dc4db --- /dev/null +++ b/egs/xbmu_amdo31/s5/RESULTS @@ -0,0 +1,8 @@ +%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0 +%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0 +%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0 +%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0 +%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0 +%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0 +%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0 +%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0 diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh new file mode 100644 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/xbmu_amdo31/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/xbmu_amdo31/s5/conf/decode.config b/egs/xbmu_amdo31/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/xbmu_amdo31/s5/conf/mfcc.conf b/egs/xbmu_amdo31/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/xbmu_amdo31/s5/conf/online_pitch.conf b/egs/xbmu_amdo31/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/xbmu_amdo31/s5/conf/pitch.conf b/egs/xbmu_amdo31/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/xbmu_amdo31/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..826aa163f2a --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=1 +num_jobs_final=2 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $*" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 5 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +exit; diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..52d56adbc60 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $*" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/xbmu_amdo31/s5/local/download_and_untar.sh b/egs/xbmu_amdo31/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..9c70836bf46 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/download_and_untar.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_aishell, resource_aishell." 
+fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_aishell resource_aishell" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="15582913665 1246920" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +if [ $part == "data_aishell" ]; then + cd $data/$part/wav + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..610774fb2a2 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev test" +gmm=tri5a +online=false +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires$online_affix + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/xbmu_amdo-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=$(cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l) + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..3f920315b77 --- /dev/null +++ b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=$(cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l) + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+  for decode_set in dev test; do
+    # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+    num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l)
+    decode_dir=${dir}_online/decode_${decode_set}_per_utt
+    steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \
+      --config conf/decode.config --per-utt true \
+      $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1;
+  done
+fi
+
+wait;
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/score.sh b/egs/xbmu_amdo31/s5/local/score.sh
new file mode 100755
index 00000000000..d283ceb68dc
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/score.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+set -e -o pipefail
+set -x
+steps/score_kaldi.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
diff --git a/egs/xbmu_amdo31/s5/local/wer_hyp_filter b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_hyp_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<STDIN>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/wer_output_filter b/egs/xbmu_amdo31/s5/local/wer_output_filter
new file mode 100755
index 00000000000..aceeeec41b4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_output_filter
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (<>) {
+  @F = split " ";
+  print $F[0] . " ";
+  foreach $s (@F[1..$#F]) {
+    if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) {
+      print "";
+    } else {
+      print "$s"
+    }
+    print " ";
+  }
+  print "\n";
+}
+
+
diff --git a/egs/xbmu_amdo31/s5/local/wer_ref_filter b/egs/xbmu_amdo31/s5/local/wer_ref_filter
new file mode 100755
index 00000000000..c6660e4efe1
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/wer_ref_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+@filters=('','');
+
+foreach $w (@filters) {
+  $bad{$w} = 1;
+}
+
+while(<STDIN>) {
+  @A = split(" ", $_);
+  $id = shift @A;
+  print "$id ";
+  foreach $a (@A) {
+    if (!defined $bad{$a}) {
+      print "$a ";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
new file mode 100755
index 00000000000..a3ba6fabaf4
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+#           2021 Northwest Minzu University (Senyan Li)
+# Apache 2.0
+
+. ./path.sh || exit 1;
+
+if [ $# != 2 ]; then
+  echo "Usage: $0 "
+  echo " $0 /export/data/xbmu_amdo31/data/wav /export/data/xbmu_amdo31/data/transcript"
+  exit 1;
+fi
+
+tibetan_audio_dir=$1
+tibetan_text=$2/transcript_clean.txt
+
+train_dir=data/local/train
+dev_dir=data/local/dev
+test_dir=data/local/test
+tmp_dir=data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $tibetan_audio_dir ] || [ ! -f $tibetan_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+echo $tibetan_audio_dir
+# find wav audio file for train, dev and test resp.
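The grep calls that follow assume the corpus encodes its split in the wav paths themselves (wav/train, wav/dev, wav/test). Once $tmp_dir/wav.flist has been written by the find command below, that assumption can be checked with a small sketch like:

for split in train dev test; do
  # count how many wav paths fall into each split (illustrative only, not part of the recipe)
  echo "$split: $(grep -c -i "wav/$split" $tmp_dir/wav.flist) wav files"
done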
+find $tibetan_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=$(wc -l < "$tmp_dir/wav.flist")
+[ $n -ne 22630 ] && \
+  echo Warning: expected 22630 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+# Transcriptions preparation
+# cat $tibetan_text |head -10
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all
+  rm -f $dir/transcripts1.txt
+  while read -r line
+  do
+    line1=$(echo "$line" | cut -d '-' -f 2)
+    line2=$(grep -w $line1 $tibetan_text |cut -d " " -f 2-)
+    text=$line" "$line2
+    echo $text >>$dir/transcripts1.txt
+  done < "$dir/utt.list"
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+done
+
+mkdir -p data/train data/dev data/test
+
+for f in spk2utt utt2spk wav.scp text; do
+  cp $train_dir/$f data/train/$f || exit 1;
+  cp $dev_dir/$f data/dev/$f || exit 1;
+  cp $test_dir/$f data/test/$f || exit 1;
+done
+
+echo "$0: tibetan data preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
new file mode 100755
index 00000000000..1e5537858ff
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+# prepare dict resources
+
+. ./path.sh
+
+[ $# != 1 ] && echo "Usage: $0 " && exit 1;
+
+res_dir=$1
+dict_dir=data/local/dict
+mkdir -p $dict_dir
+cp $res_dir/lexicon.txt $dict_dir
+
+cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
+  perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
+    m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
+    foreach $l (values %q) {print "$l\n";}
+  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
+
+echo sil > $dict_dir/silence_phones.txt
+
+echo sil > $dict_dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone
+
+cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
+cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+  >> $dict_dir/extra_questions.txt || exit 1;
+
+echo "$0: Tibetan dict preparation succeeded"
+exit 0;
diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
new file mode 100755
index 00000000000..658f0e7bc15
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
+done
+
+# This script takes no arguments.  It assumes you have already run
+# xbmu_amdo31_data_prep.sh.
+# It takes as input the files
+# data/local/train/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+kaldi_lm=$(command -v train_lm.sh)
+if [ -z $kaldi_lm ]; then
+  echo "$0: train_lm.sh is not found. That might mean it's not installed"
+  echo "$0: or it is not added to PATH"
+  echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
+  exit 1
+fi
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+# note: we probably won't really make use of it as there aren't any OOVs
+cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \
+  || exit 1;
+
+# note: ignore 1st field of train.txt, it's the utterance-id.
+cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
+  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
+  || exit 1;
+
+train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
+
+# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
+# Perplexity over 128254.000000 words is 90.446690
+
+# note: output is
+# data/local/lm/3gram-mincount/lm_unpruned.gz
+
+exit 0
+
+
+# From here are some commands to do a baseline with SRILM (assuming
+# you have it installed).
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+                   # kaldi_lm results
+sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
+mkdir -p $sdir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -$heldout_sent > $sdir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$heldout_sent > $sdir/train
+
+cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist
+
+
+ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
+  -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
+ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482
+
+# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above.
+# Difference in WSJ must have been due to different treatment of the unknown word.
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
+# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh
new file mode 100755
index 00000000000..b70ffbfbb26
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=$(pwd)/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/xbmu_amdo31/s5/run.sh b/egs/xbmu_amdo31/s5/run.sh
new file mode 100755
index 00000000000..61b3e8f62d8
--- /dev/null
+++ b/egs/xbmu_amdo31/s5/run.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+# Copyright 2021 Northwest Minzu University (Authors: Senyan Li)
+#           2017 Hui Bu
+#           2017 Jiayu Du
+#           2017 Xingyu Na
+#           2017 Bengu Wu
+#           2017 Hao Zheng
+# Apache 2.0
+
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
+# corpus directory and download URL
+data=/home1/lsy/kaldi/egs/xbmu_amdo31/s5/export/data
+data_url=www.openslr.org/resources/133
+
+. ./cmd.sh
+
+#local/download_and_untar.sh $data $data_url xbmu-amdo31 || exit 1;
+
+# Lexicon Preparation,
+local/xbmu_amdo31_prepare_dict.sh $data/xbmu_amdo31/resource || exit 1;
+
+# Data Preparation,
+local/xbmu_amdo31_data_prep.sh $data/xbmu_amdo31/data/wav $data/xbmu_amdo31/data/transcript || exit 1;
+
+# Phone Sets, questions, L compilation
+utils/prepare_lang.sh --position-dependent-phones false data/local/dict \
+  "" data/local/lang data/lang || exit 1;
+
+# LM training
+local/xbmu_amdo31_train_lms.sh || exit 1;
+
+# G compilation, check LG composition
+utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
+  data/local/dict/lexicon.txt data/lang_test || exit 1;
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+for x in train dev test; do
+  steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+  utils/fix_data_dir.sh data/$x || exit 1;
+done
+
+# Train a monophone model on delta features.
+steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono || exit 1;
+
+# Decode with the monophone model.
+utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/dev exp/mono/decode_dev
+steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \
+  exp/mono/graph data/test exp/mono/decode_test
+
+# Get alignments from monophone system.
+steps/align_si.sh --cmd "$train_cmd" --nj 10 \
+  data/train data/lang exp/mono exp/mono_ali || exit 1;
+
+# Train the first triphone pass model tri1 on delta + delta-delta features.
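As a reminder of the interface understood here (and shared by the later train_lda_mllt.sh and train_sat.sh calls), the two leading numbers passed to these GMM training scripts are the number of decision-tree leaves and the total number of Gaussians, followed by the data, lang, alignment and output directories; schematically:

# steps/train_deltas.sh [--cmd ...] <num-leaves> <tot-gauss> <data-dir> <lang-dir> <ali-dir> <exp-dir>
# e.g. this recipe uses 2500 leaves / 20000 Gaussians for tri1 through tri4a, and 3500 / 100000 for tri5a.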
+steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri1/graph data/dev exp/tri1/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri1/graph data/test exp/tri1/decode_test + +# align tri1 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2 [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + +# decode tri2 +utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri2/graph data/dev exp/tri2/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ + exp/tri2/graph data/test exp/tri2/decode_test + +# Align training data with the tri2 model. +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train the second triphone pass model tri3a on LDA+MLLT features. +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + +# Run a test decode with the tri3a model. +utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev +steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + +# align tri3a with fMLLR + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + +# Train the third triphone pass model tri4a on LDA+MLLT+SAT features. +# From now on, we start building a more serious system with Speaker +# Adaptive Training (SAT). +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + +# decode tri4a +utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + +# align tri4a with fMLLR +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri4a exp/tri4a_ali + +# Train tri5a, which is LDA+MLLT+SAT +# Building a larger SAT system. 
You can see the num-leaves is 3500 and tot-gauss is 100000 + +steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + +# decode tri5a +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + +# align tri5a with fMLLR +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + +# nnet3 +local/nnet3/run_tdnn.sh + +# chain +local/chain/run_tdnn.sh + +# getting results (see RESULTS file) +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null +for x in exp/*/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + +exit 0; diff --git a/egs/xbmu_amdo31/s5/steps b/egs/xbmu_amdo31/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/xbmu_amdo31/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/utils b/egs/xbmu_amdo31/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/xbmu_amdo31/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 4d4efbc0172..5036d12b707 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,6 +34,12 @@ SUBDIRS += $(CUDADECODER) endif endif +ifeq ($(ROCM), true) +ifeq ($(WITH_CUDADECODER), true) +SUBDIRS += $(CUDADECODER) +endif +endif + SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) @@ -56,14 +62,16 @@ endif # Don't call rm -rf. rmlibdir: +ifeq ($(KALDI_FLAVOR), dynamic) ifneq ($(KALDILIBDIR), ) - -rm -f $(KALDILIBDIR)/*{.so,.a,.o} + -rm -f $(KALDILIBDIR)/*{.so,.a,.o,.dylib} -rmdir 2>/dev/null $(KALDILIBDIR); true else # KALDILIBDIR might have been unset because of reconfigure. Do a best guess. @echo "Something seems wrong. Please re-run configure." @echo "I will continue but the cleanup might not be complete." endif +endif kaldi.mk: @echo "ERROR: kaldi.mk does not exist; run ./configure first."; diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 31440edf3f9..68ef224b5f5 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -76,7 +76,7 @@ int main() { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. exit(1); - } catch (kaldi::KaldiFatalError &e) { + } catch (const kaldi::KaldiFatalError &e) { std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index a9904a752cd..572cbb4effd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -185,12 +185,12 @@ class MessageLogger { #define KALDI_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_ASSERT(cond) (void)0 +#define KALDI_ASSERT(cond) (void)(cond) #endif // Some more expensive asserts only checked if this defined. 
@@ -198,12 +198,12 @@ class MessageLogger { #define KALDI_PARANOID_ASSERT(cond) \ do { \ if (cond) \ - (void)0; \ + (void)(cond); \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_PARANOID_ASSERT(cond) (void)0 +#define KALDI_PARANOID_ASSERT(cond) (void)(cond) #endif /***** THIRD-PARTY LOG-HANDLER *****/ diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 7ebf4f85386..6d96ecf2b75 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,9 +39,21 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include -// for discussion on what to do if you need compile kaldi -// without OpenFST, see the bottom of this this file +#if OPENFST_VER >= 10800 +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef float float32; +typedef double double64; +#else #include +#endif namespace kaldi { using ::int16; @@ -53,23 +65,4 @@ namespace kaldi { typedef float float32; typedef double double64; } // end namespace kaldi - -// In a theoretical case you decide compile Kaldi without the OpenFST -// comment the previous namespace statement and uncomment the following -/* -namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; - typedef int32_t int32; - typedef int64_t int64; - - typedef uint8_t uint8; - typedef uint16_t uint16; - typedef uint32_t uint32; - typedef uint64_t uint64; - typedef float float32; - typedef double double64; -} // end namespace kaldi -*/ - #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 3c93dfd0d39..6aee0c5ce78 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -49,7 +49,7 @@ int32 TypeOneUsage(const ParseOptions &po, } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -78,7 +78,6 @@ int32 TypeOneUsage(const ParseOptions &po, << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " @@ -124,7 +123,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -151,7 +150,6 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 22d4d92055d..535c18365ed 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -172,7 +172,8 @@ int main(int argc, char *argv[]) { if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; fst::FstPrinter fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cerr, "standard error"); + printer_print(std::cerr, fstprinter, "standard error"); + //fstprinter.Print(&std::cerr, "standard error"); KALDI_LOG << "phone 
sequence is: "; for (size_t i = 0; i < phones.size(); i++) std::cerr << phones[i] << ' '; diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 3e622cafdc7..d03bf671245 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -52,7 +52,7 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 n_utts = 0, n_total_vectors = 0, - n_success = 0, n_missing = 0, n_other_errors = 0; + n_success = 0, n_missing = 0; for (; !vector_reader1.Done(); vector_reader1.Next()) { std::string key = vector_reader1.Key(); @@ -75,7 +75,6 @@ int32 TypeOneUsage(const ParseOptions &po) { << "system " << (i + 2) << ", rspecifier: " << vector_in_fns[i] << " vs " << vector_out.Dim() << " primary vector, rspecifier:" << vector_in_fn1; - n_other_errors++; } } else { KALDI_WARN << "No vector found for utterance " << key << " for " diff --git a/src/chain/Makefile b/src/chain/Makefile index fbad28f7de6..dbe6c38709f 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += chain-kernels.o endif @@ -28,7 +28,14 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif + include ../makefiles/default_rules.mk diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index f5814d7c11c..48c80cc8d92 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,6 +22,10 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" +#ifdef __IS_HIP_COMPILE__ +#include +#endif + #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index a63944f0012..ad6691fc895 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,6 +20,11 @@ #include #include "chain/chain-kernels-ansi.h" +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include +#endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 57fe15923b0..b29000a448c 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -571,9 +571,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst( - fst).Write(os, - write_options); + fst::StdCompactAcceptorFst cfst(fst); + cfst.Write(os, write_options); } } else { KALDI_ASSERT(e2e_fsts.size() == num_sequences); @@ -586,9 +585,8 @@ void Supervision::Write(std::ostream &os, bool binary) const { // Write using StdAcceptorCompactFst, making use of the fact that it's an // acceptor. 
fst::FstWriteOptions write_options(""); - fst::StdCompactAcceptorFst( - e2e_fsts[i]).Write(os, - write_options); + fst::StdCompactAcceptorFst cfst(e2e_fsts[i]); + cfst.Write(os, write_options); } } WriteToken(os, binary, ""); diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60a2645b31b 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -347,7 +347,7 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_read = 0, num_written = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -361,7 +361,6 @@ int main(int argc, char *argv[]) { BaseFloat weight = 1.0; if (!egs_weight_reader.HasKey(key)) { KALDI_WARN << "No weight for example key " << key; - num_err++; continue; } weight = egs_weight_reader.Value(key); @@ -371,7 +370,6 @@ int main(int argc, char *argv[]) { if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; - num_err++; continue; } std::string new_output_name = output_name_reader.Value(key); diff --git a/src/configure b/src/configure index ed627eceedc..3743c31f76b 100755 --- a/src/configure +++ b/src/configure @@ -39,7 +39,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=14 +CONFIGURE_VERSION=15 # We support bash version 3.2 (Macs still ship with this version as of 2019) # and above. @@ -74,6 +74,9 @@ Configuration options: --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. + --use-rocm Build with ROCm + --rocm-dir=DIR ROCM directory + --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -248,6 +251,71 @@ function check_for_slow_expf { fi } +# ROCM is used only in selected directories including src/cudamatrix, src/nnet* +# and src/chain*. It is used to accelerate the neural network training. +# The rest of Kaldi runs on CPUs. + +function configure_rocm { + # Check for ROCM in the system + if [ ! -d "$ROCMDIR" ]; then + for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do + if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then + ROCMDIR=$base + break + fi + done + fi + + if [ -d "$ROCMDIR" ]; then + if [ ! 
-f $ROCMDIR/bin/hipcc ]; then + failure "Cannnot find hipcc and hipconfig in ROCm directory $ROCMDIR" + fi + fi + echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" + echo >> kaldi.mk + echo "# ROCm configuration" >> kaldi.mk + echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk + echo ROCM = true >> kaldi.mk + echo "ROCMDIR = $ROCMDIR" >> kaldi.mk + echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk + + echo "CUDA_ARCH = " >> kaldi.mk + echo "ROCM_ARCH_FLAGS = " >> kaldi.mk + for i in ${ROCM_TARGETS//,/ } ; do + echo "Targetting ROCm arch $i" + echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk + done + + echo "HOST_ARCH = `uname -m`" >> kaldi.mk + echo >> kaldi.mk + + ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) + echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk + ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) + echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk + + # Only ROCm 5.2+ is supported. + if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then + echo "\ +WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." + exit 1; + fi + + # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, + # use direct calls to uname -m here + if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then + cat makefiles/hip_64bit.mk >> kaldi.mk + else + echo "\ +WARNING: ROCm will not be used! + ROCm is only supported with 64-bit Linux builds." + exit 1; + fi +} + + + # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -283,6 +351,7 @@ Either your CUDA is too new or too old." GCC_VER=$($CXX -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in + # Update this list by consulting https://gist.github.com/ax3l/9489132 # Disabling CUDA 7 and CUDA 8 because we now use C++14 to compile CUDA # code. It is still possible to use those cuda versions by switching # back to C++11 in src/makefiles/cuda_64bit.mk and use CUB <= 1.8.0. @@ -317,16 +386,23 @@ Either your CUDA is too new or too old." 11_*) MIN_UNSUPPORTED_GCC_VER="12.0" MIN_UNSUPPORTED_GCC_VER_NUM=120000 - ;; + CUSOLVER=true + ;; + 12_*) + MIN_UNSUPPORTED_GCC_VER="12.3" + MIN_UNSUPPORTED_GCC_VER_NUM=123000 + CUSOLVER=true + ;; *) failure "Unsupported CUDA version ${CUDA_VERSION}. Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ output of either 'nvcc -h' or 'ptxas -h'." ;; esac - (( GCC_VER_NUM < MIN_UNSUPPORTED_GCC_VER_NUM )) || + if [ $GCC_VER_NUM -ge $MIN_UNSUPPORTED_GCC_VER_NUM ]; then failure "CUDA $CUDA_VERSION does not support $CXX (g++-$GCC_VER).\ Only versions strictly older than $MIN_UNSUPPORTED_GCC_VER are supported." 
+ fi case $CUDA_VERSION in [1-8]_* | 9_0) CUSOLVER=false ;; @@ -345,6 +421,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; 11_0) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80" ;; 11_*) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" ;; + 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; *) failure \ "Unsupported CUDA version ${CUDA_VERSION}. Please open an" \ "issue at https://github.com/kaldi-asr/kaldi/issues and" \ @@ -371,6 +448,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk + echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -602,7 +680,8 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=true +use_cuda=false +use_rocm=false with_cudadecoder=true static_fst=false static_math=false @@ -651,8 +730,11 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-cuda) - use_cuda=true; + --use-rocm) + use_rocm=true; + shift ;; + --use-rocm=no) + use_rocm=false; shift ;; --use-cuda=yes) use_cuda=true; @@ -729,6 +811,13 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; + --rocm-dir=*) + # ROCM is used in src/cudamatrix and src/nnet{,bin} only. + GetSwitchExistingPathOrDie ROCMDIR "$1" + shift ;; + --rocm-targets=*) + GetSwitchValueOrDie ROCM_TARGETS "$1" + shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -935,6 +1024,14 @@ OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d" if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. 
You need OpenFst >= 1.6.0.)" fi + +if [ $OPENFST_VER_NUM -lt 10800 ]; then + echo "CXXLANGVERSION = c++14" +else + echo "CXXLANGVERSION = c++17" +fi >> kaldi.mk + +echo "OPENFSTVER = $OPENFST_VER_NUM" >> kaldi.mk echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" @@ -968,7 +1065,11 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - echo "WITH_CUDADECODER = false" >> kaldi.mk + if $use_rocm; then + echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk + else + echo "WITH_CUDADECODER = false" >> kaldi.mk + fi fi echo >> kaldi.mk @@ -1057,6 +1158,8 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" + elif [ "$(uname -m)" == "arm64" ]; then + cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk fi @@ -1304,6 +1407,7 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda + $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index e2569e89ab7..a7972f1831d 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,13 +3,15 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -34,8 +36,14 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index f02a78ed1af..429d9f72326 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,6 +17,11 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#endif + #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index 45064e15071..fec2470a9db 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,7 +17,13 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index 6e78d7212fd..bec20cb9e07 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,7 +21,13 @@ #include 
"cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include #include diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index 89e93e5d98c..32d7ac40e12 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,7 +26,13 @@ #include -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index c076910672a..4b30c568e73 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,7 +23,13 @@ #include -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index fc0d2cddd2c..add66312817 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed = old; + assumed.ull = old.ull; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed = old; + assumed.ull = old.ull; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index 3a835d02b76..e20a7dea15c 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,10 +15,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef __IS_HIP_COMPILE__ +#include + +#include "float.h" +#include "hipify.h" +#else #include +#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" +#ifndef FLT_MAX +#define FLT_MAX 340282346638528859811704183484516925440.0f +#endif + namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 1ec456ac32c..15f29d27122 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,8 +37,15 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include -#include +#include +#endif #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -184,35 +191,36 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_and_acoustic_cost_concat_, + (void **)&h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_, + (void **)&h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_, + (void **)&h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_, + (void **)&h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_acoustic_cost_concat_tmp_, + (void **)&h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_extra_prev_tokens_concat_tmp_, + (void **)&h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_infotoken_concat_tmp_, + (void **)&h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR( + cudaMallocHost((void **)&h_channels_counters_, + nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index de2bd09f47c..f6ee37512e2 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,7 +20,13 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 56066ee069d..6b0d34f81b7 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,8 +22,15 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include 
+#include + +#include "hipify.h" +#else #include -#include +#include +#endif namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/lattice-postprocessor.cc b/src/cudadecoder/lattice-postprocessor.cc index 46d44216890..49f96191787 100644 --- a/src/cudadecoder/lattice-postprocessor.cc +++ b/src/cudadecoder/lattice-postprocessor.cc @@ -78,13 +78,14 @@ bool LatticePostprocessor::GetPostprocessedLattice( KALDI_ASSERT(decoder_frame_shift_ != 0.0 && "SetDecoderFrameShift() must be called (typically by pipeline)"); - if (!word_info_) - KALDI_ERR << "You must set --word-boundary-rxfilename in the lattice " - "postprocessor config"; - // ok &= - // Ignoring the return false for now (but will print a warning), - // because the doc says we can, and it can happen when using endpointing - WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + if (word_info_) { + // ok &= + // Ignoring the return false for now (but will print a warning), + // because the doc says we can, and it can happen when using endpointing + WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); + } else { + *out_clat = clat; + } return ok; } diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 1f093299eb4..96b00c06101 100644 --- a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 1aba7144af1..a47ea2e2300 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,9 +23,15 @@ #error CUDA support must be configured to compile this binary. 
#endif +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#include "roctracer/roctx.h" +#else #include #include -#include +#include +#endif #include #include diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 46138116bd8..06aac47b5e0 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,9 +17,15 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include "hip/hip_runtime.h" +#include "hipify.h" +#include "roctracer/roctx.h" +#else #include #include -#include +#include +#endif #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index 992b34598d2..b7a9d463214 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,9 +18,17 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include -#include +#include +#endif #include diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index 54bcc53af1e..d7739dae623 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,13 +2,15 @@ all: ; include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif +endif TESTFILES = @@ -37,9 +39,14 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation +ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) - +endif +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) +endif else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index c839548d6eb..1df9c6a7a43 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,7 +15,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
// +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { @@ -24,6 +30,7 @@ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { retval.y = a.y - b.y; return retval; } + __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x + b.x; @@ -31,11 +38,6 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { return retval; } -__device__ inline void atomicAdd(float2 *addr, float2 val) { - atomicAdd(reinterpret_cast(addr), val.x); - atomicAdd(reinterpret_cast(addr) + 1, val.y); -} - __device__ inline void operator+=(float2 &a, float2 &b) { // overloading += a.x += b.x; diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 0b57d6a32ea..5b94c34e829 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,7 +16,13 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { @@ -45,7 +51,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -96,8 +102,11 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / + GPU_MAX_WARPS_PER_BLOCK, + num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -210,8 +219,11 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(num_chunk_frames, num_lanes); @@ -306,10 +318,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less than stash size - KALDI_ASSERT(stash_size <= 32); - // This only works if stash size is <= 32 as we rely on __syncthreads() - // to avoid read/write hazards when reading/writing in-place - dim3 threads(32, 32); + 
KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); + // This only works if stash size is <= GPU_WARP_SIZE as we rely on + // __syncthreads() to avoid read/write hazards when reading/writing in-place + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, feat_dim, ldf, @@ -318,9 +330,11 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = - (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -502,8 +516,9 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); - int block = (n + 31) / 32; // blocks in x and y dimensions + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + int block = + (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -579,7 +594,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -624,7 +639,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 538e268dd98..1699f8c1e77 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,6 +15,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. 
+#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#ifdef CUDA_R_32F +#undef CUDA_R_32F +#endif +#define CUDA_R_32F HIPBLAS_R_32F +#endif + #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index c43adaccc2e..bc06ea32d69 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,8 +17,16 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" +#ifdef __IS_HIP_COMPILE__ +#include + +#include + +#include "hipify.h" +#else #include -#include +#include +#endif #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" @@ -62,7 +70,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } } @@ -481,7 +489,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(32, 8); + dim3 Bl(GPU_WARP_SIZE, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index e4549c7177c..d18f5237e8f 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,8 +19,14 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index ba13b4fe484..e432fe56573 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,11 +15,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include + +#include "hipify.h" +#else #include +#endif + #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" +// HIP builds do not required packed floating point operators definition. 
+#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -32,6 +42,7 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } +#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) @@ -179,8 +190,9 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads - if (threads > 1024) threads = 1024; + threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads + if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 3912661c4fd..7b514010562 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,8 +17,16 @@ #include "cudafeat/feature-spectral-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include + +#include "hipify.h" +#else +#include #include +#endif #include "cudamatrix/cu-rand.h" @@ -128,7 +136,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += 32) { + for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { sum += v[idx] * w[idx]; } @@ -487,7 +495,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(32, 8); + dim3 mel_threads(GPU_WARP_SIZE, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index 8683372098c..b0e4a24c8d2 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,8 +19,14 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index b8db5bd46d3..e001eb0790f 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,7 +17,13 @@ #include "cudafeat/feature-window-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index 981345404f5..e03fda01ca7 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,7 +20,13 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else +#include +#endif namespace kaldi { @@ -95,7 +101,8 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc 
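The CMVN change above derives the block size from feat_dim and clamps it to GPU_MAX_THREADS_PER_BLOCK instead of hard-coding 32 and 1024. As a rough sketch of the launch-geometry arithmetic only (these helper names are invented, and the usual idiom rounds up to a multiple of the warp width before capping at the device limit):

// Hypothetical helpers illustrating block/grid sizing; not part of the patch.
static inline int RoundUpToWarp(int n) {
  // Smallest multiple of the warp width that is >= n.
  return (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * GPU_WARP_SIZE;
}

static inline int ClampBlockSize(int threads) {
  // A block may not exceed the per-block thread limit.
  return threads > GPU_MAX_THREADS_PER_BLOCK ? GPU_MAX_THREADS_PER_BLOCK
                                             : threads;
}

// A 2D grid that tiles a num_rows x num_cols matrix with
// GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK blocks (ceil-division per axis),
// matching the dim3 blocks(...) computations used throughout the patch.
static inline dim3 TiledGrid(int num_rows, int num_cols) {
  dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK);
  return dim3((num_cols + threads.x - 1) / threads.x,
              (num_rows + threads.y - 1) / threads.y);
}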
- CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL( + cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index fa000f03b62..6c588c40c24 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,6 +23,10 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include "hipify.h" +#endif + #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index 12d9b071f59..b7128dec7e6 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,22 +15,32 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif + #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim= 32x32 +// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for 32 warps - __shared__ typename WarpReduce::TempStorage temp_storage[32]; + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + __shared__ + typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; - __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float + s_A[GPU_MAX_WARPS_PER_BLOCK] + [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -41,13 +51,15 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; + i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += 32) { // threadIdx.y, keep all threads present + j += + GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -133,9 +145,11 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for 32 warps + // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -201,7 +215,8 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); 
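batched_gemv_reduce_kernel above sizes its cub::WarpReduce scratch and its padded shared-memory tile with the GPU_* constants instead of literal 32s. A stripped-down sketch of the same CUB warp-reduction pattern, assuming <cub/cub.cuh> (or its hipCUB equivalent) is on the include path; the kernel name and indexing here are made up for the example:

#include <cub/cub.cuh>

// Toy kernel: each warp reduces GPU_WARP_SIZE consecutive floats to one sum.
// Launch with blockDim = (GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK).
__global__ void warp_sum_kernel(const float *in, float *out, int n) {
  typedef cub::WarpReduce<float> WarpReduce;
  // One TempStorage slot per warp in the block.
  __shared__ typename WarpReduce::TempStorage
      temp_storage[GPU_MAX_WARPS_PER_BLOCK];

  int warp = threadIdx.y;  // warp id within the block
  int idx = (blockIdx.x * blockDim.y + warp) * GPU_WARP_SIZE + threadIdx.x;
  float val = (idx < n) ? in[idx] : 0.0f;

  // Lane 0 of each warp receives the warp-wide sum.
  float sum = WarpReduce(temp_storage[warp]).Sum(val);
  if (threadIdx.x == 0) out[blockIdx.x * blockDim.y + warp] = sum;
}

The +1 column padding on the shared tile in the real kernel serves a separate purpose, spreading transposed accesses across shared-memory banks so they do not conflict.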
CU_SAFE_CALL(cudaGetLastError()); } @@ -209,8 +224,11 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size - if (threads > 1024) threads = 1024; // Max block size is 1024 threads + int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * + GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size + if (threads > GPU_MAX_THREADS_PER_BLOCK) + threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is + // GPU_MAX_THREADS_PER_BLOCK threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -232,7 +250,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -243,7 +261,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(32, 32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index bd4964860e0..daf1c7dfbf9 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,8 +16,20 @@ // limitations under the License. #if HAVE_CUDA == 1 -#include +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +// The BLAS enumerators are used instead of the SOLVER ones. +#ifdef CUBLAS_FILL_MODE_LOWER +#undef CUBLAS_FILL_MODE_LOWER +#endif +#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER +#else +#include #endif +#endif + #include #include "base/io-funcs.h" diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index 9dbb5d30fa1..ed1c413c939 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,12 +3,14 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 24e7cbd4a70..44ef403f21a 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,8 +18,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index 36cfc4ad90c..ff9415b8f11 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. 
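Both ivector sources remap CUBLAS_FILL_MODE_LOWER onto its hipSOLVER counterpart with an #undef/#define pair, so call sites written against the cuSOLVER enum compile unchanged under HIP. In isolation the pattern looks like the sketch below (names shown only to illustrate the remap; the surrounding call sites are not reproduced here):

// Illustrative remap: make an existing CUDA-flavoured name resolve to the
// HIP equivalent without touching the call sites.
#ifdef __IS_HIP_COMPILE__
#ifdef CUBLAS_FILL_MODE_LOWER
#undef CUBLAS_FILL_MODE_LOWER
#endif
#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER
#endif
// ...solver calls that pass CUBLAS_FILL_MODE_LOWER now need no edits.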
#if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 99883f3114a..3fcc1aea659 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifndef __IS_HIP_COMPILE__ #include #endif +#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index 787aceeca0d..e3f2ed75d30 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,8 +16,10 @@ // limitations under the License. #if HAVE_CUDA +#ifndef __IS_HIP_COMPILE__ #include -#include +#include +#endif #endif #include diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index b9135c3cee6..d54ba56be84 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,7 +16,9 @@ // limitations under the License. #if HAVE_CUDA == 1 -#include +#ifndef __IS_HIP_COMPILE__ +#include +#endif #endif #include "base/kaldi-common.h" #include "util/common-utils.h" diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c2ba44fd7..45c10b78899 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(CUDA), true) +ifeq ($(IS_GPU_BUILD), true) OBJFILES += cu-kernels.o endif @@ -27,8 +27,15 @@ ifeq ($(CUDA), true) endif endif +ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ +endif + +ifeq ($(ROCM), true) +%.o : %.cu + $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ +endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index e438c604509..c4cceedca48 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,9 +23,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #include +#endif #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index d7d65da806a..3edd9f1ca40 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,10 +23,18 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #endif +#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 53de59fe4fc..b8c250c6771 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,7 +28,13 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include +#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-array.cc 
b/src/cudamatrix/cu-array.cc index 53eccdd44c5..2a29338aeb1 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,8 +22,14 @@ #include #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#else #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 84f78f00a91..3db44bf4aa5 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -105,13 +105,12 @@ class CuArrayBase { protected: /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuArrayBase(): data_(NULL), dim_(0) { } + CuArrayBase(): data_(NULL), dim_(0) { } T *data_; ///< GPU data pointer (if GPU not available, ///< will point to CPU memory). MatrixIndexT dim_; ///< dimension of the vector - }; /** @@ -123,22 +122,21 @@ class CuArrayBase { template class CuArray: public CuArrayBase { public: - /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. - CuArray() { } + CuArray() { } /// Constructor with memory initialisation. resize_type may be kSetZero or /// kUndefined. - explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) + explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) { Resize(dim, resize_type); } /// Constructor from CPU-based int vector - explicit CuArray(const std::vector &src) { CopyFromVec(src); } + explicit CuArray(const std::vector &src) { CopyFromVec(src); } /// Copy constructor. We don't make this explicit because we want to be able /// to create a std::vector. - CuArray(const CuArray &src) { CopyFromArray(src); } + CuArray(const CuArray &src) { CopyFromArray(src); } /// Destructor ~CuArray() { Destroy(); } @@ -172,7 +170,6 @@ class CuArray: public CuArrayBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; - }; @@ -182,7 +179,7 @@ class CuSubArray: public CuArrayBase { /// Constructor as a range of an existing CuArray or CuSubArray. Note: like /// similar constructors in class CuVector and others, it can be used to evade /// 'const' constraints; don't do that. 
- explicit CuSubArray(const CuArrayBase &src, + explicit CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim); /// Construct from raw pointers diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index e0c64912207..63cf33f98b2 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,9 +19,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 10fc00da681..938ec679f68 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,7 +22,15 @@ #include "cudamatrix/cu-common.h" +#ifdef __IS_HIP_COMPILE__ +#include + +#include "hipify.h" +#define API_NAME_PREFIX "HIP" +#else #include +#define API_NAME_PREFIX "CU" +#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" @@ -31,6 +39,9 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { +#ifdef __IS_HIP_COMPILE__ + roctxRangePushA(name); +#else const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -43,9 +54,14 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); +#endif } NvtxTracer::~NvtxTracer() { - nvtxRangePop(); +#ifdef __IS_HIP_COMPILE__ + roctxRangePop(); +#else + nvtxRangePop(); +#endif } #endif @@ -87,61 +103,106 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_SUCCESS: + return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: + return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case 
HIPBLAS_STATUS_HANDLE_IS_NULLPTR: + return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; + case HIPBLAS_STATUS_INVALID_ENUM: + return API_NAME_PREFIX "BLAS_STATUS_INVALID_ENUM"; +#endif } - return "CUBLAS_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; - #if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; - #endif + case CUSPARSE_STATUS_SUCCESS: + return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; +#if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: + return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: + return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; +#endif } - return "CUSPARSE_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case
CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_SUCCESS: + return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; +#ifdef __IS_HIP_COMPILE__ + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; +#endif } - return "CURAND_STATUS_UNKNOWN_ERROR"; + return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 83f8a39a8b9..f7f45b8043a 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,11 +31,25 @@ #if HAVE_CUDA +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #include -#include +#include + +#define GPU_WARP_SIZE 32 +#define GPU_MAX_THREADS_PER_BLOCK 1024 +#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE) +#endif #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index be02921169d..bb4017de9bb 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,9 +19,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 39bcf373ace..fd2c0c64f1f 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,10 +23,17 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include + +#include "hipify.h" +#else #include #include #include - +#endif // __IS_HIP_COMPILE__ #include #include #include @@ -240,8 +247,12 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode +#ifdef __IS_HIP_COMPILE__ + KALDI_WARN << "Not in compute-exclusive mode."; 
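The status-string helpers above build their messages as API_NAME_PREFIX "BLAS_...", relying on C and C++ adjacent-string-literal concatenation so one switch body reports "CUBLAS_..." under CUDA and "HIPBLAS_..." under HIP. A small self-contained illustration of that preprocessor technique (the enum and function here are invented for the example and are not part of the patch):

#include <cstdio>

#ifdef __IS_HIP_COMPILE__
#define API_NAME_PREFIX "HIP"
#else
#define API_NAME_PREFIX "CU"
#endif

enum class FakeStatus { kSuccess, kAllocFailed };

const char *FakeStatusString(FakeStatus s) {
  switch (s) {
    // Adjacent string literals are concatenated at compile time, so this
    // expands to "CUBLAS_STATUS_SUCCESS" or "HIPBLAS_STATUS_SUCCESS".
    case FakeStatus::kSuccess:     return API_NAME_PREFIX "BLAS_STATUS_SUCCESS";
    case FakeStatus::kAllocFailed: return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED";
  }
  return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR";
}

int main() { std::printf("%s\n", FakeStatusString(FakeStatus::kSuccess)); }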
+#else KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; +#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 2f278eb85b9..fe8ac795560 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,14 +28,27 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#include +#include +#include +#include +#include + +#include "hipify.h" +#else #include #include #include #include #include - +#endif #if CUDA_VERSION >= 9010 +#ifdef __IS_HIP_COMPILE__ +#include +#else #include +#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 8044ff699bc..b3c3165bd96 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,11 +27,23 @@ #include #include +#ifdef __IS_HIP_COMPILE__ +#define __CUDA_ARCH__ 800 +#include +#include + +#include +#include + +#include "cudamatrix/cu-kernels-ansi.h" +#include "hipify.h" +#else #include +#include "cudamatrix/cu-common.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION - +#endif //__IS_HIP_COMPILE__ /*********************************************************************** * Generic __device__ functions @@ -953,11 +965,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, } // Warp reduce. Implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } // output 1 sum per thread block @@ -1109,8 +1122,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = 32; - constexpr int THREADS_Y = 16; + constexpr int THREADS_X = GPU_WARP_SIZE; + constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; dim3 thrds(THREADS_X, THREADS_Y); @@ -1167,6 +1180,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; + __syncwarp(); } } @@ -1206,11 +1220,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, } // Warp reduce to 1 element. Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } // output 1 sum per thread block @@ -1257,12 +1272,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. - if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; + if (tid < warpSize) { + ssum[tid] += ssum[tid + shift]; + } + __syncwarp(); } - } // output TileDim sums per thread block if (tid < TileDim) { @@ -1340,13 +1356,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. 
- if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + for (int shift = warpSize; shift >= TileDim; shift >>= 1) { + if (tid < warpSize) { smem.sum[tid] += smem.sum[tid + shift]; } + __syncwarp(); } - // output TileDim sums per thread block if (tid < TileDim && j_n < dim_N.cols) { v[j_n] = alpha * smem.sum[tid] + beta * v[j_n]; @@ -1793,10 +1809,11 @@ static void _vec_transform_reduce( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); } + __syncwarp(); } // Output to vector result. @@ -2006,9 +2023,11 @@ static void _transform_reduce_mat_rows( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } // Output to vector result. @@ -2045,11 +2064,13 @@ static void _transform_reduce_mat_cols( } // Reduce last warp. Threads implicitly synchronized within a warp. - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); + } + __syncwarp(); } - + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2087,13 +2108,12 @@ static void _group_transform_reduce( x_idx += threads_per_group; } sreduction[tid] = treduction; - if (threads_per_group > warpSize) { - __syncthreads(); - } + __syncthreads(); // tree-reduce to 2x warpSize elements per group -# pragma unroll - for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) { + int shift = threads_per_group / 2; +#pragma unroll + for (; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } @@ -2101,14 +2121,12 @@ static void _group_transform_reduce( } // Warp-reduce to 1 element per group. - // Threads implicitly synchronized within the warp. - const int warp_reduce_size = - threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize; - if (threadIdx.x < warp_reduce_size) { # pragma unroll - for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { + for (; shift > 0; shift >>= 1) { + if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } + __syncwarp(); } // Store the result. 
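Several kernels above are reorganized so the `if (tid < warpSize)` guard sits inside the reduction loop and each step ends with __syncwarp(); the reduction then no longer depends on implicit warp-synchronous execution, which differs once the wavefront is 64 lanes wide. A minimal sketch of the restructured final stage (hypothetical device helper; it assumes a prior __syncthreads() has left 2*warpSize partial sums at the front of sdata):

// Hypothetical final stage of a block reduction, not a kernel from the patch.
__device__ void final_warp_reduce(float *sdata, int tid) {
  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < warpSize) {
      sdata[tid] += sdata[tid + shift];
    }
    __syncwarp();  // explicit intra-warp sync after every step
  }
  // sdata[0] now holds the block-wide sum (read it from tid == 0).
}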
@@ -2967,12 +2985,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, } // reduce to 1 element per row - if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { + for (int shift = warpSize; shift > 0; shift >>= 1) { + if (tid < warpSize) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; } + __syncwarp(); } // broadcast the sum results @@ -3254,15 +3273,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, } // Warp reduce without __syncthreads() // (note.: synchronizes implicitly within a warp at the multiprocessor) - if (tid < warpSize / 2) { #pragma unroll - for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; - num_working_threads >>= 1) { + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (tid < warpSize / 2) { if (smax[tid + num_working_threads] > smax[tid]) { smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } } + __syncwarp(); } if (tid == 0) { @@ -3990,9 +4010,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of 32x32 (32 rows, 32 cols per CTA) -// grid dim x,y expands to fill out average in x/y across batches -// grid dim.z is batch +// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE +// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y +// expands to fill out average in x/y across batches grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4371,7 +4391,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4392,6 +4412,11 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#endif } } @@ -4402,7 +4427,11 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#endif } } @@ -5077,7 +5106,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); + _trace_mat_mat<<>>(A, B, dA, B_stride, value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5098,6 +5127,11 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MTN<64> + <<>>(alpha, M, stride_M, N, dim_N, beta, v, 
stride_v); +#endif } } @@ -5108,7 +5142,11 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#ifdef __IS_HIP_COMPILE__ + } else if (Bl.x == 64) { + _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); +#endif } } @@ -5479,25 +5517,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5793,7 +5831,15 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all threads without blocking. void cuda_legacy_noop() { +#ifdef __IS_HIP_COMPILE__ + // HIP doesn't currently support cudaStreamLegacy stream so we force the + // implementation to use the legacy (not per-thread) API to get similar + // semantics. 
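The cudaF/cudaD_add_diag_mat_mat_MTN and _MN wrappers above gain a Bl.x == 64 branch under HIP so the templated kernel is also instantiated for a 64-lane wavefront, alongside the existing 16- and 32-wide tiles. A condensed sketch of that runtime-to-compile-time dispatch (kernel name, body, and arguments are invented for the example):

// Illustrative only: select a compile-time tile width from the runtime
// block shape, adding the 64-wide instantiation for HIP wavefronts.
template <int TileDim>
__global__ void tile_kernel(const float *in, float *out, int n) {
  // Trivial body, just to make the instantiation concrete.
  for (int i = threadIdx.x; i < n; i += TileDim) out[i] = in[i];
}

void launch_tile_kernel(dim3 Gr, dim3 Bl, const float *in, float *out, int n) {
  if (Bl.x == 16) {
    tile_kernel<16><<<Gr, Bl>>>(in, out, n);
  } else if (Bl.x == 32) {
    tile_kernel<32><<<Gr, Bl>>>(in, out, n);
#ifdef __IS_HIP_COMPILE__
  } else if (Bl.x == 64) {
    tile_kernel<64><<<Gr, Bl>>>(in, out, n);
#endif
  }
}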
+ auto k = reinterpret_cast(_noop_kernel); + hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); +#else _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); +#endif } void cudaF_mat_copy_range_clamped( @@ -5803,8 +5849,10 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(32,32); - dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5817,8 +5865,10 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(32,32); - dim3 blocks((num_cols+31)/32,(num_rows+31)/32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 blocks( + (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5827,8 +5877,7 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - - dim3 threads(32,32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5854,9 +5903,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5876,10 +5926,11 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - remaining); + + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5892,8 +5943,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - - dim3 threads(32,32); + dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5919,9 +5969,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - MAX_BATCH_SIZE); + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5942,10 +5993,11 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks((cols + 31) / 32, - (rows + 31) / 32, - remaining); - + dim3 blocks( + (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, + (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 3fbeff3a470..d0d8e4e771f 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), // n_blocks(num_rows, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index be8483e48f5..dfcaf30770a 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2705,7 +2705,7 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index c67842d38bf..53831a52bc8 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,9 +27,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -243,7 +250,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, } else { // 2D thread block with warps (blockDim.x) along the row-dim of input M. 
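The batched-copy wrappers above fill a fixed-size descriptor on the host and hand it to the kernel by value, so the per-matrix metadata travels in the kernel's parameter space (constant memory) with no explicit device memcpy. A toy version of that pattern (struct, kernel, and the TOY_MAX_BATCH constant are invented for the sketch):

#define TOY_MAX_BATCH 16

struct ToyCopyDesc {
  const float *src[TOY_MAX_BATCH];
  float *dst[TOY_MAX_BATCH];
  int num_elems[TOY_MAX_BATCH];
};

// The descriptor is a kernel argument, so it is copied into the launch's
// parameter buffer as part of the launch itself.  Launch with gridDim.y equal
// to the number of matrices in the batch.
__global__ void toy_batched_copy(ToyCopyDesc desc) {
  int b = blockIdx.y;  // one y-slice of the grid per matrix
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < desc.num_elems[b];
       i += gridDim.x * blockDim.x) {
    desc.dst[b][i] = desc.src[b][i];
  }
}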
// Each (8x32) thread block will transpose (32x32) data - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(M.NumCols(), warpSize), n_blocks(M.NumRows(), warpSize)); @@ -849,7 +856,7 @@ void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), n_blocks(NumRows(), dimBlock.y)); @@ -999,7 +1006,7 @@ void CuMatrixBase::AddSmat(Real alpha, const CuSparseMatrix &A, // We use warpSize threads per row to access only the nonzero elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of A. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y)); @@ -2176,7 +2183,7 @@ Real TraceMatMat(const CuMatrixBase &A, // if the matrix is not in a very bad shape. // (wider or taller than 32x8192) // CPU will then reduce to 1 element. - const int kWarpSize = 32; + const int kWarpSize = GPU_WARP_SIZE; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), n_blocks(A.NumRows(), kWarpSize)); @@ -2398,7 +2405,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { // and use transposed copy to fill *this // see CuMatrixBase::CopyFromMat() for more detail of the impl MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ }; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(rv_dim.cols, warpSize), n_blocks(rv_dim.rows, warpSize)); @@ -2408,7 +2415,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } else if (rv.Dim() == num_rows_) { // use 2D block (8x32) and large enough grid to cover matrix *this // dimBlock.x need to be at least warpSize for coalesced memory access. - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), n_blocks(num_rows_, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index a531ecd45b9..775fecd82c6 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -231,7 +231,7 @@ class CuMatrixBase { bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; /// Get size of matrix in bytes - MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } + size_t SizeInBytes() const { return static_cast(num_rows_)*static_cast(stride_)*sizeof(Real); } // Copy functions. These do not resize. template @@ -250,7 +250,7 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + // This function will copy from source rows (start_range, end_range] // if the range is outside of the clamped region then the clamped // row will be replicated across the out of range areas @@ -307,9 +307,9 @@ class CuMatrixBase { void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); void Floor(const CuMatrixBase &src, Real floor_val); - + void Ceiling(const CuMatrixBase &src, Real ceiling_val); - + /// This is equivalent to running: /// Floor(src, lower_limit); /// Ceiling(src, upper_limit); @@ -320,7 +320,7 @@ class CuMatrixBase { /// (x < 0 ? exp(x) : x + 1). 
This function is used /// in our RNNLM training. void ExpSpecial(const CuMatrixBase &src); - + /// Softmax nonlinearity /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, /// with attention to avoiding overflow or underflow. @@ -333,7 +333,7 @@ class CuMatrixBase { /// Supports in-place operation (i.e. this == &src). void LogSoftMaxPerRow(const CuMatrixBase &src); - + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -439,23 +439,23 @@ class CuMatrixBase { this -> Pow(*this, power); }; - + inline void ApplyPowAbs(Real power, bool include_sign=false) { this -> PowAbs(*this, power, include_sign); }; - + inline void ApplyHeaviside() { this -> Heaviside(*this); }; - + inline void ApplyFloor(Real floor_val) { this -> Floor(*this, floor_val); }; - + inline void ApplyCeiling(Real ceiling_val) { this -> Ceiling(*this, ceiling_val); }; - + inline void ApplyExp() { this -> Exp(*this); }; @@ -670,13 +670,13 @@ class CuMatrixBase { inline const CuSubVector Row(MatrixIndexT i) const { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return CuSubVector(data_ + (i * stride_), NumCols()); + return CuSubVector(data_ + (static_cast(i) * static_cast(stride_)), NumCols()); } inline CuSubVector Row(MatrixIndexT i) { KALDI_ASSERT(static_cast(i) < static_cast(num_rows_)); - return CuSubVector(data_ + (i * stride_), NumCols()); + return CuSubVector(data_ + (static_cast(i) * static_cast(stride_)), NumCols()); } inline CuValue operator() (MatrixIndexT r, MatrixIndexT c) { @@ -684,7 +684,7 @@ class CuMatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return CuValue(data_ + r * stride_ + c); + return CuValue(data_ + static_cast(r) * static_cast(stride_) + c); } inline Real operator() (MatrixIndexT r, MatrixIndexT c) const { @@ -692,7 +692,7 @@ class CuMatrixBase { static_cast(num_rows_) && static_cast(c) < static_cast(num_cols_)); - return CuValue(data_ + r * stride_ + c); // will be casted to Real. + return CuValue(data_ + static_cast(r) * static_cast(stride_) + c); // will be casted to Real. } Real Sum() const; @@ -737,10 +737,10 @@ class CuMatrixBase { /// Get raw row pointer (const). Warning: may return a pointer to GPU memory. Use at /// your own risk. - inline const Real* RowData(MatrixIndexT r) const { return data_ + r * stride_; } + inline const Real* RowData(MatrixIndexT r) const { return data_ + static_cast(r) * static_cast(stride_); } /// Get raw row pointer. Warning: may return a pointer to GPU memory. Use at /// your own risk. - inline Real* RowData(MatrixIndexT r) { return data_ + r * stride_; } + inline Real* RowData(MatrixIndexT r) { return data_ + static_cast(r) * static_cast(stride_); } /// Return data pointer (const). Warning: may return a pointer to GPU memory. /// Use at your own risk. inline const Real *Data() const { return data_; } @@ -924,7 +924,7 @@ class CuSubMatrix: public CuMatrixBase { /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. 
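The accessor changes above widen the row index and stride to size_t before multiplying, because `r * stride_` in 32-bit MatrixIndexT arithmetic can overflow once a matrix spans more than 2^31 elements. A two-line host illustration of the hazard and the fix (values invented; assumes a 64-bit size_t):

#include <cstddef>
#include <cstdint>

int main() {
  int32_t num_rows = 70000, stride = 70000;  // ~4.9e9 elements
  // A 32-bit product here would overflow and give a bogus byte count:
  //   int32_t bad = num_rows * stride;
  // Widen before multiplying, as the patch does:
  size_t bytes = static_cast<size_t>(num_rows) * static_cast<size_t>(stride) *
                 sizeof(float);
  return bytes > 0 ? 0 : 1;
}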
- inline CuSubMatrix (const CuSubMatrix &other): + inline CuSubMatrix(const CuSubMatrix &other): CuMatrixBase (other.data_, other.num_rows_, other.num_cols_, other.stride_) {} private: diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 756d580c7cf..001170fdeca 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,9 +21,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index d1efc0cff9c..96085848d72 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,9 +19,16 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index aad34b5dd54..0c2230a8731 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() { template static void UnitTestCuSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 2; i++) { - MatrixIndexT row = 10 + Rand() % 40; - MatrixIndexT col = 10 + Rand() % 50; + MatrixIndexT row = 2 + Rand() % 3; + MatrixIndexT col = 1 + Rand() % 4; CuMatrix mat1(row, col); CuMatrix mat2(col, row); @@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() { cu_smat2.CopyToMat(&mat2); Real trace1 = TraceMatMat(mat3, mat1, kTrans); + Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans); AssertEqual(trace1, trace2, 0.00001); trace1 = TraceMatMat(mat3, mat2, kNoTrans); trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans); + AssertEqual(trace1, trace2, 0.00001); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 703aa40e735..81ecbe68080 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,9 +22,16 @@ #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include #include @@ -138,7 +145,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. 
- const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); @@ -161,7 +168,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -194,8 +201,8 @@ template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( - NULL) { + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), + csr_val_(NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); @@ -266,8 +273,9 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); + csr_col_idx_ = NULL; // may be freed, but this is allowed. csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); @@ -277,10 +285,16 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; - csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( - (num_rows + 1 + nnz) * sizeof(int))); - csr_val_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int))); + if (nnz > 0) { + csr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + nnz * sizeof(int))); + csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); + } else { + csr_col_idx_ = NULL; + csr_val_ = NULL; + } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { @@ -302,8 +316,11 @@ void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - if (csr_row_ptr_col_idx_) { - CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); + if (csr_row_ptr_) { + CuDevice::Instantiate().Free(csr_row_ptr_); + } + if (csr_col_idx_) { + CuDevice::Instantiate().Free(csr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); @@ -311,7 +328,8 @@ void CuSparseMatrix::Destroy() { num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_col_idx_ = NULL; + csr_row_ptr_ = NULL; + csr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -378,11 +396,17 @@ void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); - CuSubArray idx_to(csr_row_ptr_col_idx_, - NumRows() + 1 + NumElements()); - CuSubArray idx_from(smat.csr_row_ptr_col_idx_, - smat.NumRows() + 1 + smat.NumElements()); - idx_to.CopyFromArray(idx_from); + { + CuSubArray idx_to(csr_row_ptr_, NumRows() + 1); + CuSubArray idx_from(smat.csr_row_ptr_, NumRows() + 1); + idx_to.CopyFromArray(idx_from); + } + + { + CuSubArray idx_to(csr_col_idx_, NumElements()); + CuSubArray idx_from(smat.csr_col_idx_, NumElements()); + idx_to.CopyFromArray(idx_from); + } } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), 
kUndefined); @@ -413,9 +437,14 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { smat->Resize(0, 0); return; } - CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); - std::vector idx_cpu; - idx.CopyToVec(&idx_cpu); + CuSubArray row_ptr(csr_row_ptr_, NumRows() + 1); + std::vector row_ptr_cpu; + row_ptr.CopyToVec(&row_ptr_cpu); + + + CuSubArray col_idx(csr_col_idx_, NumElements()); + std::vector col_idx_cpu; + col_idx.CopyToVec(&col_idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); @@ -425,8 +454,8 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { - for (; n < idx_cpu[i + 1]; ++n) { - const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; + for (; n < row_ptr_cpu[i + 1]; ++n) { + const MatrixIndexT j = col_idx_cpu[n]; pairs[i].push_back( { j, val_cpu(n) }); } } @@ -484,7 +513,8 @@ void CuSparseMatrix::Swap(CuSparseMatrix *smat) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); - std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); + std::swap(csr_row_ptr_, smat->csr_row_ptr_); + std::swap(csr_col_idx_, smat->csr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif @@ -548,7 +578,7 @@ Real TraceMatSmat(const CuMatrixBase &A, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); @@ -638,7 +668,7 @@ void CuSparseMatrix::CopyToMat(CuMatrixBase *M, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. - const int warpSize = 32; + const int warpSize = GPU_WARP_SIZE; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index 82b17a0dc71..180beed6183 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,13 +121,13 @@ class CuSparseMatrix { /// Default constructor CuSparseMatrix() : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { } /// Constructor from CPU-based sparse matrix. explicit CuSparseMatrix(const SparseMatrix &smat) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat); } @@ -135,7 +135,7 @@ class CuSparseMatrix { /// Constructor from GPU-based sparse matrix (supports transposition). CuSparseMatrix(const CuSparseMatrix &smat, MatrixTransposeType trans = kNoTrans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat, trans); } @@ -200,19 +200,19 @@ class CuSparseMatrix { /// indices of the first nonzero element in the i-th row, while the last entry /// contains nnz_, as zero-based CSR format is used. 
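Since the change above splits the old combined `csr_row_ptr_col_idx_` buffer into separate `csr_row_ptr_` and `csr_col_idx_` arrays, a small worked example of the zero-based CSR layout may help; the matrix below is illustrative only and is not taken from the tests:

```cpp
// Illustrative 3x3 matrix with nnz_ == 4 in zero-based CSR form:
//
//   M = [ 0 5 0 ]     csr_row_ptr_ = {0, 1, 3, 4};  // length num_rows_ + 1
//       [ 7 0 2 ]     csr_col_idx_ = {1, 0, 2, 1};  // length nnz_
//       [ 0 3 0 ]     csr_val_     = {5, 7, 2, 3};  // length nnz_
//
// Row i owns the entries in [csr_row_ptr_[i], csr_row_ptr_[i+1]), and the
// final row-pointer entry equals nnz_; this is why the old combined array
// had length num_rows_ + 1 + nnz_, and why CopyToSmat() can now read the
// column index of element n directly as csr_col_idx_[n].
```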
const int* CsrRowPtr() const { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } int* CsrRowPtr() { - return csr_row_ptr_col_idx_; + return csr_row_ptr_; } /// Returns pointer to the integer array of length nnz_ that contains /// the column indices of the corresponding elements in array CsrVal() const int* CsrColIdx() const { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } int* CsrColIdx() { - return csr_row_ptr_col_idx_ + num_rows_ + 1; + return csr_col_idx_; } private: @@ -238,9 +238,10 @@ class CuSparseMatrix { // number of non-zeros MatrixIndexT nnz_; - // csr row ptrs and col indices in a single int array - // of the length (num_rows_ + 1 + nnz_) - int* csr_row_ptr_col_idx_; + // length num_rows_ + 1 + int* csr_row_ptr_; + // length nnz_ + int* csr_col_idx_; // csr value array of the length nnz_ Real* csr_val_; diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index 377c34239f0..da19a31b39a 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,9 +19,16 @@ // limitations under the License. #if HAVE_CUDA==1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 8de46ec46f5..4219467f615 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -48,18 +48,18 @@ class CuTpMatrix : public CuPackedMatrix { CuTpMatrix() : CuPackedMatrix() {} explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) : CuPackedMatrix(r, resize_type) {} - - explicit CuTpMatrix(const TpMatrix &orig) + + explicit CuTpMatrix(const TpMatrix &orig) : CuPackedMatrix(orig) {} // This constructor lacks the "explicit" keyword so that // we can include this class in std::vector. - CuTpMatrix(const CuTpMatrix &orig) + CuTpMatrix(const CuTpMatrix &orig) : CuPackedMatrix(orig) {} - - explicit CuTpMatrix(const CuMatrixBase &orig, + + explicit CuTpMatrix(const CuMatrixBase &orig, MatrixTransposeType trans = kNoTrans); - + ~CuTpMatrix() {} void CopyFromMat(const CuMatrixBase &M, @@ -70,12 +70,12 @@ class CuTpMatrix : public CuPackedMatrix { } void CopyFromTp(const TpMatrix &other) { CuPackedMatrix::CopyFromPacked(other); - } + } void Cholesky(const CuSpMatrix& Orig); void Invert(); CuTpMatrix &operator = (const CuTpMatrix &in); - + protected: inline const TpMatrix &Mat() const { return *(reinterpret_cast* >(this)); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 8736782a3e0..6667f2bca62 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,9 +22,16 @@ // limitations under the License. #if HAVE_CUDA == 1 +#ifdef __IS_HIP_COMPILE__ +#include +#include + +#include "hipify.h" +#else #include #include #endif +#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -629,7 +636,10 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) - // 16x16 or 8x32 2D block for coalesced memory access. + // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2 + // or + // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE + // 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. 
for small matrices, use 1D grid with only 1 row of 16x16 block, // to avoid multiple kernel launch; @@ -637,11 +647,12 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // use 1- or 2-D grid so that the grid contains // at least and not much larger than 'kOptNumBlocks' blocks // to fully utilize the GPU; - const int32 warpSize = 32; + const int32 warpSize = GPU_WARP_SIZE; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? - 16 : 32; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) + ? GPU_WARP_SIZE / 2 + : GPU_WARP_SIZE; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -667,8 +678,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), @@ -676,8 +688,9 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; + int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 + ? GPU_WARP_SIZE + : GPU_WARP_SIZE / 2; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index f1c32756887..82e1fb47fcb 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -243,7 +243,7 @@ class CuVectorBase { /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuVectorBase(): data_(NULL), dim_(0) { } + CuVectorBase(): data_(NULL), dim_(0) { } Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). 
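The `GPU_WARP_SIZE` substitutions above generalize block shapes that were hard-coded for NVIDIA's 32-thread warps so the same kernels launch sensibly on AMD's 64-lane wavefronts. A small standalone check, assuming `CU1DBLOCK` is the usual 256 and that `GPU_WARP_SIZE` expands to 32 on CUDA and 64 under HIP (an assumption, not verified here):

```cpp
// Prints the 2-D block shapes implied by the tile_dim logic in
// CuVectorBase::AddDiagMatMat() for both warp sizes.  With warp == 32 this
// reproduces the 16x16 and 8x32 (dimBlock(32, 8)) blocks that the original
// comments describe.
#include <cstdio>

int main() {
  const int CU1DBLOCK = 256;  // assumed value of Kaldi's 1-D block size
  for (int warp : {32, 64}) {
    const int small_tile = warp / 2;  // chosen for small/narrow matrices
    const int large_tile = warp;      // chosen for large matrices
    std::printf("warp=%d: dimBlock(%d,%d) or dimBlock(%d,%d)\n", warp,
                small_tile, CU1DBLOCK / small_tile,
                large_tile, CU1DBLOCK / large_tile);
  }
  return 0;
}
```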
diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 63dbe630568..537cca9b97f 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,14 +28,18 @@ namespace kaldi { #if HAVE_CUDA == 1 +#ifndef CUBLAS_R_32F +#define CUBLAS_R_32F CUDA_R_32F +#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, - C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, + lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -63,9 +67,11 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, - (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, - CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx( + handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, + lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, + ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), + CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif @@ -219,6 +225,7 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 +#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -243,6 +250,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } +#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -319,7 +327,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -336,7 +344,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 +#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc index cc8712e854d..87378f93bbd 100644 --- a/src/decoder/lattice-simple-decoder.cc +++ b/src/decoder/lattice-simple-decoder.cc @@ -45,8 +45,8 @@ void 
LatticeSimpleDecoder::InitDecoding() { bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { InitDecoding(); - - while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { + + while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); ProcessEmitting(decodable); @@ -57,7 +57,7 @@ bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { ProcessNonemitting(); } FinalizeDecoding(); - + // Returns true if we have any kind of traceback available (not necessarily // to the end state; query ReachedFinal() for that). return !final_costs_.empty(); @@ -88,9 +88,9 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, if (decoding_finalized_ && !use_final_probs) KALDI_ERR << "You cannot call FinalizeDecoding() and then call " << "GetRawLattice() with use_final_probs == false"; - + unordered_map final_costs_local; - + const unordered_map &final_costs = (decoding_finalized_ ? final_costs_ : final_costs_local); @@ -100,7 +100,7 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, ofst->DeleteStates(); int32 num_frames = NumFramesDecoded(); KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_/2 + 3; + const int32 bucket_count = num_toks_/2 + 3; unordered_map tok_map(bucket_count); // First create all states. for (int32 f = 0; f <= num_frames; f++) { @@ -169,10 +169,10 @@ bool LatticeSimpleDecoder::GetLattice( fst::ILabelCompare ilabel_comp; ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes // lattice-determinization more efficient. - + fst::DeterminizeLatticePrunedOptions lat_opts; lat_opts.max_mem = config_.det_opts.max_mem; - + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. Connect(ofst); // Remove unreachable states... there might be @@ -196,7 +196,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( bool emitting, bool *changed) { KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - + unordered_map::iterator find_iter = cur_toks_.find(state); if (find_iter == cur_toks_.end()) { // no such token presently. // Create one. @@ -221,7 +221,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( return tok; } } - + // delta is the amount by which the extra_costs must // change before it sets "extra_costs_changed" to true. If delta is larger, // we'll tend to go back less far toward the beginning of the file. @@ -242,7 +242,7 @@ void LatticeSimpleDecoder::PruneForwardLinks( warned_ = true; } } - + bool changed = true; while (changed) { changed = false; @@ -300,7 +300,7 @@ void LatticeSimpleDecoder::ComputeFinalCosts( BaseFloat infinity = std::numeric_limits::infinity(); BaseFloat best_cost = infinity, best_cost_with_final = infinity; - + for (unordered_map::const_iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { StateId state = iter->first; @@ -336,19 +336,19 @@ void LatticeSimpleDecoder::ComputeFinalCosts( // on the final frame. If there are final tokens active, it uses the final-probs // for pruning, otherwise it treats all tokens as final. void LatticeSimpleDecoder::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); + KALDI_ASSERT(!active_toks_.empty()); int32 frame_plus_one = active_toks_.size() - 1; if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. 
KALDI_WARN << "No tokens alive at end of file\n"; - typedef unordered_map::const_iterator IterType; + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; // We're about to delete some of the tokens active on the final frame, so we // clear cur_toks_ because otherwise it would then contain dangling pointers. cur_toks_.clear(); - + // Now go through tokens on this frame, pruning forward links... may have to // iterate a few times until there is no more change, because the list is not // in topological order. This is a modified version of the code in @@ -429,7 +429,7 @@ BaseFloat LatticeSimpleDecoder::FinalRelativeCost() const { return final_relative_cost_; } } - + // Prune away any tokens on this frame that have no forward links. [we don't do // this in PruneForwardLinks because it would give us a problem with dangling // pointers]. @@ -453,14 +453,14 @@ void LatticeSimpleDecoder::PruneTokensForFrame(int32 frame) { } } } - + // Go backwards through still-alive tokens, pruning them, starting not from // the current frame (where we want to keep all tokens) but from the frame before // that. We go backwards through the frames and stop when we reach a point // where the delta-costs are not changing (and the delta controls when we consider // a cost to have "not changed"). void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); + int32 cur_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; // The index "f" below represents a "frame plus one", i.e. you'd have to subtract // one to get the corresponding index for the decodable object. @@ -468,7 +468,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // Reason why we need to prune forward links in this situation: // (1) we have never pruned them // (2) we never pruned the forward links on the next frame, which - // + // if (active_toks_[f].must_prune_forward_links) { bool extra_costs_changed = false, links_pruned = false; PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); @@ -478,7 +478,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { active_toks_[f].must_prune_tokens = true; active_toks_[f].must_prune_forward_links = false; } - if (f+1 < cur_frame_plus_one && + if (f+1 < cur_frame_plus_one && active_toks_[f+1].must_prune_tokens) { PruneTokensForFrame(f+1); active_toks_[f+1].must_prune_tokens = false; @@ -493,20 +493,20 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // (optionally) on the final frame. Takes into account the final-prob of // tokens. This function used to be called PruneActiveTokensFinal(). void LatticeSimpleDecoder::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); + int32 final_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { + for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { bool b1, b2; // values not used. 
BaseFloat dontcare = 0.0; PruneForwardLinks(f, &b1, &b2, dontcare); PruneTokensForFrame(f + 1); } - PruneTokensForFrame(0); + PruneTokensForFrame(0); KALDI_VLOG(3) << "pruned tokens from " << num_toks_begin << " to " << num_toks_; } - + void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { int32 frame = active_toks_.size() - 1; // frame is the frame-index // (zero-based) used to get likelihoods @@ -538,9 +538,9 @@ void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { // AddToken adds the next_tok to cur_toks_ (if not already present). Token *next_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, true, NULL); - + // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } @@ -553,7 +553,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() { // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). - + // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. Note-- this queue structure is is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), @@ -569,15 +569,9 @@ void LatticeSimpleDecoder::ProcessNonemitting() { queue.push_back(state); best_cost = std::min(best_cost, iter->second->tot_cost); } - if (queue.empty()) { - if (!warned_) { - KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is " - << frame; - warned_ = true; - } - } + BaseFloat cutoff = best_cost + config_.beam; - + while (!queue.empty()) { StateId state = queue.back(); queue.pop_back(); @@ -600,10 +594,10 @@ void LatticeSimpleDecoder::ProcessNonemitting() { bool changed; Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, false, &changed); - + tok->links = new ForwardLink(new_tok, 0, arc.olabel, graph_cost, 0, tok->links); - + // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. if (changed && fst_.NumInputEpsilons(arc.nextstate) != 0) @@ -662,5 +656,3 @@ void LatticeSimpleDecoder::PruneCurrentTokens(BaseFloat beam, unordered_map fstprinter(*f, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -224,7 +226,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Composed FST is:\n"; { // Try to print the fst. 
FstPrinter fstprinter(fst_composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index 5e4f1812930..ae902021c7d 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -22,6 +22,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; using std::cout; @@ -94,7 +96,7 @@ template void TestDeterminizeLattice() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst det_fst; try { @@ -106,7 +108,7 @@ template void TestDeterminizeLattice() { std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); // OK, now determinize it a different way and check equivalence. @@ -117,7 +119,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact FST is:\n"; { FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (kaldi::Rand() % 2 == 1) ConvertLattice(det_fst, &compact_det_fst, false); @@ -128,7 +130,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); @@ -149,14 +151,14 @@ template void TestDeterminizeLattice2() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; } diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index e9650ca29a6..36c9ba397a6 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -725,7 +725,7 @@ void DeterminizerStar::EpsilonClosure:: { // this sorting is based on StateId - sort(ecinfo_.begin(), ecinfo_.end()); + std::sort(ecinfo_.begin(), ecinfo_.end()); output_subset->clear(); diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index 272774b20aa..36568c5bea4 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -24,6 +24,7 @@ #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -38,7 +39,7 @@ template void TestDeterminizeGeneral() { std::cout 
<< "FST before determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; try { @@ -46,7 +47,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST after determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { @@ -101,7 +102,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -109,7 +110,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -122,7 +123,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -138,7 +139,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -157,14 +158,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -174,7 +175,7 @@ template void TestDeterminize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -242,7 +243,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
Connect(fst); @@ -250,7 +251,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -267,7 +268,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, fst_pushed, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -320,7 +321,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -328,7 +329,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -341,7 +342,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -357,7 +358,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -370,7 +371,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -382,7 +383,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -392,7 +393,8 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -401,7 +403,7 @@ template void TestMinimize() { { std::cout <<" printing after minimization [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); @@ -410,7 +412,7 @@ template void TestMinimize() { { std::cout <<" printing factor-weight FST\n"; FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard 
output"); } Map(fwfst, &ofst_star, FromGallicMapper()); @@ -418,7 +420,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } } @@ -431,7 +433,7 @@ template void TestMinimize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index 9f13b8b9695..d58dbfa539c 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -79,7 +80,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(&fst); @@ -87,7 +88,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (fst.Start() == kNoStateId) return; // "Connect" made it empty. diff --git a/src/fstext/fstext-lib.h b/src/fstext/fstext-lib.h index bdb8ff730e5..03c8e5861dd 100644 --- a/src/fstext/fstext-lib.h +++ b/src/fstext/fstext-lib.h @@ -20,6 +20,9 @@ #ifndef KALDI_FSTEXT_FSTEXT_LIB_H_ #define KALDI_FSTEXT_FSTEXT_LIB_H_ #include "fst/fstlib.h" + +#include "fstext/openfst_compat.h" + #include "fstext/context-fst.h" #include "fstext/determinize-star.h" #include "fstext/factor.h" diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 148c64f3699..fb3a637bc19 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -163,7 +163,7 @@ void RemoveSomeInputSymbols(const std::vector &to_remove, MutableFst *fst) { KALDI_ASSERT_IS_INTEGER_TYPE(I); RemoveSomeInputSymbolsMapper mapper(to_remove); - Map(fst, mapper); + ArcMap(fst, mapper); } template @@ -374,12 +374,25 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); - for (const SymbolTable::iterator::value_type &sym : symtab) { - if (include_eps || sym.Label() != 0) { - syms_out->push_back(sym.Label()); - KALDI_ASSERT(syms_out->back() == sym.Label()); // an integer-range thing. +#if OPENFST_VER >= 10800 + for (SymbolTable::iterator iter = symtab.begin(); + iter != symtab.end(); + ++iter) { + if (include_eps || iter->Label() != 0) { + syms_out->push_back(iter->Label()); + KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. + } + } +#else + for (SymbolTableIterator iter(symtab); + !iter.Done(); + iter.Next()) { + if (include_eps || iter.Value() != 0) { + syms_out->push_back(iter.Value()); + KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. 
} } +#endif } template diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 38ecc35dae2..460e49c7dec 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -23,6 +23,8 @@ #include "util/stl-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { using std::vector; @@ -140,7 +142,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -148,7 +150,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -362,7 +364,7 @@ void TestEqualAlign() { template void Print(const Fst &fst, std::string message) { std::cout << message << "\n"; FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } diff --git a/src/fstext/fstext-utils.h b/src/fstext/fstext-utils.h index 5789dbe7cc3..db14ddd3576 100644 --- a/src/fstext/fstext-utils.h +++ b/src/fstext/fstext-utils.h @@ -113,7 +113,7 @@ void PushInLog(VectorFst *fst, uint32 ptype, float delta = kDelta) { template void MinimizeEncoded(VectorFst *fst, float delta = kDelta) { - Map(fst, QuantizeMapper(delta)); + ArcMap(fst, QuantizeMapper(delta)); EncodeMapper encoder(kEncodeLabels | kEncodeWeights, ENCODE); Encode(fst, &encoder); internal::AcceptorMinimize(fst); diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index 01047919c22..3baa5b95c9c 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -24,6 +24,8 @@ #include "util/text-utils.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -44,7 +46,8 @@ void WriteFstKaldi(std::ostream &os, bool binary, bool acceptor = false, write_one = false; FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - printer.Print(os, ""); + //printer.Print(&os, ""); + printer_print(os, printer, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; // Write another newline as a terminating character. The read routine will diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h index a45920936ec..3c34f4b4787 100644 --- a/src/fstext/kaldi-fst-io.h +++ b/src/fstext/kaldi-fst-io.h @@ -26,6 +26,7 @@ #include #include #include "base/kaldi-common.h" +#include "fstext/openfst_compat.h" // Some functions for writing Fsts. 
// I/O for FSTs is a bit of a mess, and not very well integrated with Kaldi's diff --git a/src/fstext/lattice-utils-inl.h b/src/fstext/lattice-utils-inl.h index c97a538dd1d..03ac9947c5c 100644 --- a/src/fstext/lattice-utils-inl.h +++ b/src/fstext/lattice-utils-inl.h @@ -268,9 +268,9 @@ void ConvertFstToLattice( MutableFst > > *ofst) { int32 num_states_cache = 50000; fst::CacheOptions cache_opts(true, num_states_cache); - fst::MapFstOptions mapfst_opts(cache_opts); + fst::ArcMapFstOptions mapfst_opts(cache_opts); StdToLatticeMapper mapper; - MapFst >, + ArcMapFst >, StdToLatticeMapper > map_fst(ifst, mapper, mapfst_opts); *ofst = map_fst; } diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc index 8b596fcc356..6f1d2747cc1 100644 --- a/src/fstext/lattice-utils-test.cc +++ b/src/fstext/lattice-utils-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { template void TestConvert(bool invert) { @@ -31,7 +33,7 @@ template void TestConvert(bool invert) { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -39,14 +41,14 @@ template void TestConvert(bool invert) { std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst origfst; ConvertLattice(ofst, &origfst, invert); std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -67,7 +69,7 @@ template void TestShortestPath() { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst cfst; ConvertLattice(*fst, &cfst, false); // invert == false @@ -205,7 +207,7 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -213,14 +215,14 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); /*std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -260,7 +262,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST 
before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -268,7 +270,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after converting and scaling is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); @@ -276,7 +278,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after back conversion and scaling is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); }*/ // If RandEquivalent doesn't work, it could be due to a nasty issue related to the use // of exact floating-point comparisons in the Plus function of LatticeWeight. diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index ab33a7f243a..1458b1c9891 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -23,6 +23,7 @@ #include "fst/fstlib.h" #include "base/kaldi-common.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -438,11 +439,9 @@ class CompactLatticeWeightTpl { CompactLatticeWeightTpl(const WeightType &w, const std::vector &s): weight_(w), string_(s) { } - CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) { - weight_ = w.weight_; - string_ = w.string_; - return *this; - } + CompactLatticeWeightTpl(const CompactLatticeWeightTpl &compactLatticeWeightTpl) = default; + + CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) = default; const W &Weight() const { return weight_; } diff --git a/src/fstext/openfst_compat.h b/src/fstext/openfst_compat.h new file mode 100644 index 00000000000..251d3f893c5 --- /dev/null +++ b/src/fstext/openfst_compat.h @@ -0,0 +1,42 @@ +#ifndef KALDI_FSTEXT_OPENFST_COMPAT_H +#define KALDI_FSTEXT_OPENFST_COMPAT_H + + +#if OPENFST_VER < 10800 +#define FST_FLAGS_fst_weight_separator FLAGS_fst_weight_separator +#define FST_FLAGS_fst_field_separator FLAGS_fst_field_separator +#define FST_FLAGS_v FLAGS_v + +#endif + +namespace fst { +#if OPENFST_VER >= 10800 + + +template +auto Map(Args&&... args) -> decltype(ArcMap(std::forward(args)...)) { + return ArcMap(std::forward(args)...); +} + +using MapFstOptions=ArcMapFstOptions; + +template +using MapFst = ArcMapFst; + +template +void printer_print(Stream &os, Printer &printer, const std::string &s) { + printer.Print(os, s); +} + +#else + +template +void printer_print(Stream &os, Printer &printer, const std::string &s) { + printer.Print(&os, s); +} + +#endif + +} // namespace fst + +#endif //KALDI_FSTEXT_OPENFST_COMPAT_H diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index 4ce58c97abc..45e1a82279a 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -235,9 +235,14 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix, assert(symTable != NULL); const char *prefix_ptr = prefix.c_str(); size_t prefix_len = strlen(prefix_ptr); // allowed to be zero but not encouraged. - for (const SymbolTable::iterator::value_type &symbol : *symTable) { - const char *sym = symbol.Symbol().c_str(); - if (!strncmp(prefix_ptr, sym, prefix_len)) { // has prefix. 
+#if OPENFST_VER >= 10800 + for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) { + const std::string &sym = siter->Symbol(); +#else + for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) { + const std::string &sym = siter.Symbol(); +#endif + if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) { // has prefix. if (isdigit(sym[prefix_len])) { // we don't allow prefix followed by a digit, as a symbol. // Has at least one digit. size_t pos; @@ -411,8 +416,6 @@ void PreDeterminize(MutableFst *fst, std::vector d_vec(max_state+1, false); // "done vector". Purely for debugging. - size_t num_extra_det_states = 0; - // (D)(v) while (Q.size() != 0) { @@ -491,7 +494,6 @@ void PreDeterminize(MutableFst *fst, assert(m_map.count(this_pr.first) == 0); m_map[this_pr.first] = k; k++; - num_extra_det_states++; } } else { // Create the set V_t. V_t.insert(this_pr.second); @@ -689,11 +691,9 @@ typename Arc::StateId CreateSuperFinal(MutableFst *fst) { typedef typename Arc::Weight Weight; assert(fst != NULL); StateId num_states = fst->NumStates(); - StateId num_final = 0; std::vector final_states; for (StateId s = 0; s < num_states; s++) { if (fst->Final(s) != Weight::Zero()) { - num_final++; final_states.push_back(s); } } diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc index 95ebd62f04f..60953e40b8d 100644 --- a/src/fstext/pre-determinize-test.cc +++ b/src/fstext/pre-determinize-test.cc @@ -22,8 +22,7 @@ #include "fstext/fst-test-utils.h" #include "fstext/fstext-utils.h" -// Just check that it compiles, for now. - +#include "fstext/openfst_compat.h" namespace fst { using std::vector; @@ -73,7 +72,7 @@ template void TestPreDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. 
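The `printer_print()` shim from the new `fstext/openfst_compat.h` is what all of these test-file edits switch to. A minimal usage sketch follows, assuming `FstPrinter` still comes from OpenFST's `fst/script/print-impl.h` and that `OPENFST_VER` is supplied by Kaldi's build flags (both assumptions, not taken from this patch):

```cpp
#include <iostream>

#include <fst/fstlib.h>
#include <fst/script/print-impl.h>  // assumed location of fst::FstPrinter

#include "fstext/openfst_compat.h"

// Prints an FST in text form regardless of OpenFST version: the shim calls
// printer.Print(os, ...) on OpenFST >= 1.8 and printer.Print(&os, ...) on
// older releases, so callers no longer need version-specific #ifdefs.
void PrintStdFst(const fst::StdVectorFst &f) {
  fst::FstPrinter<fst::StdArc> fstprinter(f, NULL, NULL, NULL,
                                          false /*acceptor*/,
                                          true /*show weight one*/, "\t");
  fst::printer_print(std::cout, fstprinter, "standard output");
}
```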
Connect(fst); @@ -81,7 +80,7 @@ template void TestPreDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -95,7 +94,7 @@ template void TestPreDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -111,7 +110,7 @@ template void TestPreDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -121,14 +120,14 @@ template void TestPreDeterminize() { std::cout <<" printing after determinization\n"; { FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } int64 num_removed = DeleteISymbols(&ofst, extra_syms); std::cout <<" printing after removing "< fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -180,7 +179,7 @@ template void TestAddSelfLoops() { std::cout <<" printing before adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -199,7 +198,7 @@ template void TestAddSelfLoops() { std::cout <<" printing after adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } delete fst; diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc index f27b54f4587..f91001fca0d 100644 --- a/src/fstext/prune-special-test.cc +++ b/src/fstext/prune-special-test.cc @@ -22,6 +22,8 @@ #include "fstext/rand-fst.h" #include "fstext/fstext-utils.h" +#include "fstext/openfst_compat.h" + namespace fst { static void TestPruneSpecial() { @@ -38,7 +40,7 @@ static void TestPruneSpecial() { { FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } @@ -47,7 +49,7 @@ static void TestPruneSpecial() { PruneSpecial(*ifst, &ofst1, beam); { FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } @@ -56,7 +58,7 @@ static void TestPruneSpecial() { Prune(*ifst, &ofst2, beam); { FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); std::cout << std::endl; } diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc index 9cf16bb8a84..9fe8ba63b59 100644 --- a/src/fstext/push-special-test.cc +++ b/src/fstext/push-special-test.cc @@ -23,6 +23,8 @@ #include "fstext/fstext-utils.h" #include 
"base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst { @@ -38,7 +40,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy(*fst); @@ -56,7 +58,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 2e1d3d8cfa1..1548ac5c726 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -23,6 +23,7 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" namespace fst { @@ -83,7 +84,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst_copy1(fst); @@ -96,7 +97,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -141,7 +142,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst = \n"; FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } VectorFst fst; @@ -156,7 +157,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { // make sure we preserved stochasticity in cases where doing so was diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 0e8982720d4..1cc8bd02bef 100644 --- a/src/fstext/table-matcher-test.cc +++ b/src/fstext/table-matcher-test.cc @@ -21,6 +21,8 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" +#include "fstext/openfst_compat.h" + namespace fst{ @@ -64,13 +66,13 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<"Table-Composed FST\n"; { FstPrinter fstprinter(composed, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } std::cout <<" Baseline-Composed FST\n"; { FstPrinter fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } if ( !RandEquivalent(composed, composed_baseline, 3/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 20/*path length-- max?*/)) { @@ -79,7 +81,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard 
output"); } @@ -88,7 +90,7 @@ template void TestTableMatcher(bool connect, bool left) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -149,7 +151,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -158,7 +160,7 @@ template void TestTableMatcherCacheLeft(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); @@ -219,7 +221,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff1 (composed - baseline) \n"; { FstPrinter fstprinter(diff1, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } @@ -228,7 +230,7 @@ template void TestTableMatcherCacheRight(bool connect) { std::cout <<" Diff2 (baseline - composed) \n"; { FstPrinter fstprinter(diff2, NULL, NULL, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } assert(0); diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h index 290a4f8bc2e..9e921920c48 100644 --- a/src/fstext/table-matcher.h +++ b/src/fstext/table-matcher.h @@ -22,7 +22,7 @@ #include #include - +#include "base/kaldi-types.h" namespace fst { diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc index 3045a669362..556d194a60d 100644 --- a/src/fstext/trivial-factor-weight-test.cc +++ b/src/fstext/trivial-factor-weight-test.cc @@ -22,7 +22,8 @@ #include "fstext/determinize-star.h" #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" -// Just check that it compiles, for now. + +#include "fstext/openfst_compat.h" namespace fst { @@ -73,7 +74,7 @@ template void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } // Trim resulting FST. Connect(fst); @@ -81,7 +82,7 @@ template void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - fstprinter.Print(std::cout, "standard output"); + printer_print(std::cout, fstprinter, "standard output"); } vector