diff --git a/.cardboardlint.yml b/.cardboardlint.yml deleted file mode 100644 index 4a115a37cd..0000000000 --- a/.cardboardlint.yml +++ /dev/null @@ -1,5 +0,0 @@ -linters: -- pylint: - # pylintrc: pylintrc - filefilter: ['- test_*.py', '+ *.py', '- *.npy'] - # exclude: \ No newline at end of file diff --git a/.dockerignore b/.dockerignore index 8d8ad918c9..5b28aa99dc 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,4 +6,4 @@ TTS.egg-info/ tests/outputs/* tests/train_outputs/* __pycache__/ -*.pyc \ No newline at end of file +*.pyc diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml index 34cde7e844..6a50c24562 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -59,7 +59,7 @@ body: You can either run `TTS/bin/collect_env_info.py` ```bash - wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py + wget https://raw.githubusercontent.com/idiap/coqui-ai-TTS/main/TTS/bin/collect_env_info.py python collect_env_info.py ``` diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 05ca7db6bd..ccaaff7565 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,8 @@ blank_issues_enabled: false contact_links: - name: CoquiTTS GitHub Discussions - url: https://github.com/coqui-ai/TTS/discussions + url: https://github.com/idiap/coqui-ai-TTS/discussions about: Please ask and answer questions here. - name: Coqui Security issue disclosure - url: mailto:info@coqui.ai + url: mailto:enno.hermann@gmail.com about: Please report security vulnerabilities here. diff --git a/.github/PR_TEMPLATE.md b/.github/PR_TEMPLATE.md index 330109c3bc..9e7605a4ef 100644 --- a/.github/PR_TEMPLATE.md +++ b/.github/PR_TEMPLATE.md @@ -5,11 +5,3 @@ Welcome to the 🐸TTS project! We are excited to see your interest, and appreci This repository is governed by the Contributor Covenant Code of Conduct. 
For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file. - -Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS). - -This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS): - -- Protects you, Coqui, and the users of the code. -- Does not change your rights to use your contributions for any purpose. -- Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute. diff --git a/.github/stale.yml b/.github/stale.yml index e05eaf0b57..dd45bf098f 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -15,4 +15,3 @@ markComment: > for your contributions. You might also look our discussion channels. # Comment to post when closing a stale issue. Set to `false` to disable closeComment: false - diff --git a/.github/workflows/aux_tests.yml b/.github/workflows/aux_tests.yml deleted file mode 100644 index f4cb3ecfe1..0000000000 --- a/.github/workflows/aux_tests.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: aux-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_aux diff --git a/.github/workflows/data_tests.yml b/.github/workflows/data_tests.yml deleted file mode 100644 index 3d1e3f8c4d..0000000000 --- a/.github/workflows/data_tests.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: data-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make data_tests diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 1f15159b42..249816a320 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -10,7 +10,7 @@ on: jobs: docker-build: name: "Build and push Docker image" - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: matrix: arch: ["amd64"] @@ -18,7 +18,7 @@ jobs: - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled - "python:3.10.8-slim" # CPU only steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Log in to the Container registry uses: docker/login-action@v1 with: @@ -29,11 +29,11 @@ jobs: id: compute-tag run: | set -ex - base="ghcr.io/coqui-ai/tts" + base="ghcr.io/idiap/coqui-tts" tags="" # PR build if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then - base="ghcr.io/coqui-ai/tts-cpu" + 
base="ghcr.io/idiap/coqui-tts-cpu" fi if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then @@ -42,7 +42,7 @@ jobs: branch=${github_ref#*refs/heads/} # strip prefix to get branch name tags="${base}:${branch},${base}:${{ github.sha }}," elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then - VERSION="v$(cat TTS/VERSION)" + VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)" if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then echo "Pushed tag does not match VERSION file. Aborting push." exit 1 @@ -63,3 +63,58 @@ jobs: push: ${{ github.event_name == 'push' }} build-args: "BASE=${{ matrix.base }}" tags: ${{ steps.compute-tag.outputs.tags }} + docker-dev-build: + name: "Build the development Docker image" + runs-on: ubuntu-latest + strategy: + matrix: + arch: ["amd64"] + base: + - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled + steps: + - uses: actions/checkout@v4 + - name: Log in to the Container registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Compute Docker tags, check VERSION file matches tag + id: compute-tag + run: | + set -ex + base="ghcr.io/idiap/coqui-tts-dev" + tags="" # PR build + + if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then + base="ghcr.io/idiap/coqui-tts-dev-cpu" + fi + + if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then + # Push to branch + github_ref="${{ github.ref }}" + branch=${github_ref#*refs/heads/} # strip prefix to get branch name + tags="${base}:${branch},${base}:${{ github.sha }}," + elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then + VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)" + if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then + echo "Pushed tag does not match VERSION file. Aborting push." 
+ exit 1 + fi + tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}" + fi + echo "::set-output name=tags::${tags}" + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + file: dockerfiles/Dockerfile.dev + platforms: linux/${{ matrix.arch }} + push: false + build-args: "BASE=${{ matrix.base }}" + tags: ${{ steps.compute-tag.outputs.tags }} diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml deleted file mode 100644 index d2159027b6..0000000000 --- a/.github/workflows/inference_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: inference_tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: | - export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install 
TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make inference_tests diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 2bbcf3cd70..efe4bf71d4 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -8,18 +8,18 @@ defaults: bash jobs: build-sdist: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Verify tag matches version run: | set -ex - version=$(cat TTS/VERSION) + version=$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o) tag="${GITHUB_REF/refs\/tags\/}" if [[ "v$version" != "$tag" ]]; then exit 1 fi - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: 3.9 - run: | @@ -28,67 +28,63 @@ jobs: python -m build - run: | pip install dist/*.tar.gz - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: name: sdist path: dist/*.tar.gz build-wheels: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install pip requirements + - name: Install build requirements run: | - python -m pip install -U pip setuptools wheel build - python -m pip install -r requirements.txt + python -m pip install -U pip setuptools wheel build numpy cython - name: Setup and install manylinux1_x86_64 wheel run: | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 python -m pip install dist/*-manylinux*.whl - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: name: wheel-${{ matrix.python-version }} path: dist/*-manylinux*.whl publish-artifacts: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest 
needs: [build-sdist, build-wheels] + environment: + name: release + url: https://pypi.org/p/coqui-tts + permissions: + id-token: write steps: - run: | mkdir dist - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "sdist" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.9" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.10" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.11" path: "dist/" - - run: | - ls -lh dist/ - - name: Setup PyPI config - run: | - cat << EOF > ~/.pypirc - [pypi] - username=__token__ - password=${{ secrets.PYPI_TOKEN }} - EOF - - uses: actions/setup-python@v2 + - uses: actions/download-artifact@v4 with: - python-version: 3.9 - - run: | - python -m pip install twine + name: "wheel-3.12" + path: "dist/" - run: | - twine upload --repository pypi dist/* + ls -lh dist/ + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index b7c6393baa..c913c233d8 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -7,12 +7,6 @@ on: pull_request: types: [opened, synchronize, reopened] jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - test: runs-on: ubuntu-latest strategy: @@ -21,26 +15,15 @@ jobs: python-version: [3.9] experimental: [false] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} architecture: x64 cache: 'pip' cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Style check - run: make style + - name: Install/upgrade dev dependencies + run: python3 -m pip install -r requirements.dev.txt + - name: Lint check + run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..88cc8e7949 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,81 @@ +name: tests + +on: + push: + branches: + - main + pull_request: + types: [opened, synchronize, reopened] +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [3.9, "3.10", "3.11", "3.12"] + subset: ["data_tests", "inference_tests", "test_aux", "test_text", "test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: 'requirements*' + - name: check OS + run: cat /etc/os-release + - name: set ENV + run: export 
TRAINER_TELEMETRY=0 + - name: Install Espeak + if: contains(fromJSON('["inference_tests", "test_text", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install/upgrade Python setup deps + run: python3 -m pip install --upgrade pip setuptools wheel uv + - name: Replace scarf urls + if: contains(fromJSON('["data_tests", "inference_tests", "test_aux", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + run: | + sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json + - name: Install TTS + run: | + resolution=highest + if [ "${{ matrix.python-version }}" == "3.9" ]; then + resolution=lowest-direct + fi + python3 -m uv pip install --resolution=$resolution --system "coqui-tts[dev,server,languages] @ ." 
+ - name: Unit tests + run: make ${{ matrix.subset }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} + path: .coverage.* + if-no-files-found: ignore + coverage: + if: always() + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - uses: actions/download-artifact@v4 + with: + pattern: coverage-data-* + merge-multiple: true + - name: Combine coverage + run: | + python -Im pip install --upgrade coverage[toml] + + python -Im coverage combine + python -Im coverage html --skip-covered --skip-empty + + python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml deleted file mode 100644 index 78d3026d7f..0000000000 --- a/.github/workflows/text_tests.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: text-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_text diff --git a/.github/workflows/tts_tests.yml b/.github/workflows/tts_tests.yml deleted file mode 100644 index 5074cded6d..0000000000 --- a/.github/workflows/tts_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: tts-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_tts diff --git a/.github/workflows/tts_tests2.yml b/.github/workflows/tts_tests2.yml deleted file mode 100644 index f64433f8df..0000000000 --- a/.github/workflows/tts_tests2.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: tts-tests2 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_tts2 diff --git a/.github/workflows/vocoder_tests.yml b/.github/workflows/vocoder_tests.yml deleted file mode 100644 index 6519ee3fef..0000000000 --- a/.github/workflows/vocoder_tests.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: vocoder-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_vocoder diff --git a/.github/workflows/xtts_tests.yml b/.github/workflows/xtts_tests.yml deleted file mode 100644 index be367f3547..0000000000 --- a/.github/workflows/xtts_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: xtts-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_xtts diff --git a/.github/workflows/zoo_tests0.yml b/.github/workflows/zoo_tests0.yml deleted file mode 100644 index 13f47a938b..0000000000 --- a/.github/workflows/zoo_tests0.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: zoo-tests-0 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: | - nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3 - nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion diff --git a/.github/workflows/zoo_tests1.yml b/.github/workflows/zoo_tests1.yml deleted file mode 100644 index 00f13397fa..0000000000 --- a/.github/workflows/zoo_tests1.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: zoo-tests-1 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3 diff --git a/.github/workflows/zoo_tests2.yml b/.github/workflows/zoo_tests2.yml deleted file mode 100644 index 310a831a8b..0000000000 --- a/.github/workflows/zoo_tests2.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: zoo-tests-2 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3 diff --git a/.gitignore b/.gitignore index 22ec6e410a..f9708961e2 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,4 @@ wandb depot/* coqui_recipes/* local_scripts/* -coqui_demos/* \ No newline at end of file +coqui_demos/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 911f2a838e..f96f6f38ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,27 +1,24 @@ repos: - - repo: 'https://github.com/pre-commit/pre-commit-hooks' - rev: v2.3.0 + - repo: "https://github.com/pre-commit/pre-commit-hooks" + rev: v4.5.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: 'https://github.com/psf/black' - rev: 22.3.0 + - repo: "https://github.com/psf/black" + 
rev: 24.2.0 hooks: - id: black language_version: python3 - - repo: https://github.com/pycqa/isort - rev: 5.8.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.0 hooks: - - id: isort - name: isort (python) - - id: isort - name: isort (cython) - types: [cython] - - id: isort - name: isort (pyi) - types: [pyi] - - repo: https://github.com/pycqa/pylint - rev: v2.8.2 + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - repo: local hooks: - - id: pylint + - id: generate_requirements.py + name: generate_requirements.py + language: system + entry: python scripts/generate_requirements.py + files: "pyproject.toml|requirements.*\\.txt|tools/generate_requirements.py" diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 49a9dbdd2c..0000000000 --- a/.pylintrc +++ /dev/null @@ -1,599 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. 
-#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". 
-disable=missing-docstring, - too-many-public-methods, - too-many-lines, - bare-except, - ## for avoiding weird p3.6 CI linter error - ## TODO: see later if we can remove this - assigning-non-slot, - unsupported-assignment-operation, - ## end - line-too-long, - fixme, - wrong-import-order, - ungrouped-imports, - wrong-import-position, - import-error, - invalid-name, - too-many-instance-attributes, - arguments-differ, - arguments-renamed, - no-name-in-module, - no-member, - unsubscriptable-object, - print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - useless-object-inheritance, - too-few-public-methods, - too-many-branches, - too-many-arguments, - too-many-locals, - too-many-statements, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - 
deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - duplicate-code, - not-callable, - import-outside-toplevel, - logging-fstring-interpolation, - logging-not-lazy - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. 
When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit - - -[LOGGING] - -# Format style used to check logging format string. `old` means using % -# formatting, while `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package.. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=numpy.*,torch.* - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). 
-ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. 
A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=120 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. 
-ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. -argument-rgx=[a-z_][a-z0-9_]{0,30}$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names= - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. -#class-attribute-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. -good-names=i, - j, - k, - x, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. 
-#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. -variable-rgx=[a-z_][a-z0-9_]{0,30}$ - - -[STRING] - -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). 
-ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement. -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=15 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". 
-overgeneral-exceptions=BaseException, - Exception diff --git a/.readthedocs.yml b/.readthedocs.yml index 266a2cdeb2..e19a4dccb7 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -14,8 +14,9 @@ build: # Optionally set the version of Python and requirements required to build your docs python: install: - - requirements: docs/requirements.txt - - requirements: requirements.txt + - path: . + extra_requirements: + - docs # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/CITATION.cff b/CITATION.cff index 6b0c8f19af..0be0d75d78 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -10,11 +10,11 @@ authors: version: 1.4 doi: 10.5281/zenodo.6334862 license: "MPL-2.0" -url: "https://www.coqui.ai" -repository-code: "https://github.com/coqui-ai/TTS" +url: "https://github.com/idiap/coqui-ai-TTS" +repository-code: "https://github.com/idiap/coqui-ai-TTS" keywords: - machine learning - deep learning - artificial intelligence - text to speech - - TTS \ No newline at end of file + - TTS diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index b80639d63c..9c83ebcf12 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -119,11 +119,11 @@ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. -Community Impact Guidelines were inspired by +Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. For answers to common questions about this code of conduct, see the FAQ at -[https://www.contributor-covenant.org/faq][FAQ]. Translations are available +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations]. 
[homepage]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ae0ce46048..e93858f27d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Welcome to the 🐸TTS! -This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md). +This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md). ## Where to start. We welcome everyone who likes to contribute to 🐸TTS. @@ -15,13 +15,13 @@ If you like to contribute code, squash a bug but if you don't know where to star You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc. -- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues) +- [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues) This is a place to find feature requests, bugs. Issues with the ```good first issue``` tag are good place for beginners to take on. -- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag. +- ✨**PR**✨ [pages](https://github.com/idiap/coqui-ai-TTS/pulls) with the ```🚀new version``` tag. We list all the target improvements for the next version. You can pick one of them and start contributing. @@ -46,21 +46,21 @@ Let us know if you encounter a problem along the way. The following steps are tested on an Ubuntu system. -1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. +1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page. 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```. 
```bash - $ git clone git@github.com:/TTS.git - $ cd TTS - $ git remote add upstream https://github.com/coqui-ai/TTS.git + $ git clone git@github.com:/coqui-ai-TTS.git + $ cd coqui-ai-TTS + $ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git ``` 3. Install 🐸TTS for development. ```bash $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. - $ make install + $ make install_dev ``` 4. Create a new branch with an informative name for your goal. @@ -82,13 +82,13 @@ The following steps are tested on an Ubuntu system. $ make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting. +9. Format your code. We use ```black``` for code formatting. ```bash $ make style ``` -10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions. +10. Run the linter and correct the issues raised. We use ```ruff``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions. ```bash $ make lint @@ -105,7 +105,7 @@ The following steps are tested on an Ubuntu system. ```bash $ git fetch upstream - $ git rebase upstream/master + $ git rebase upstream/main # or for the development version $ git rebase upstream/dev ``` @@ -124,7 +124,7 @@ The following steps are tested on an Ubuntu system. 13. Let's discuss until it is perfect. đŸ’Ē - We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/coqui-ai/TTS/pulls]. + We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/idiap/coqui-ai-TTS/pulls]. 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. @@ -132,14 +132,14 @@ The following steps are tested on an Ubuntu system. 
If you prefer working within a Docker container as your development environment, you can do the following: -1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. +1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page. 2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. ```bash - $ git clone git@github.com:/TTS.git - $ cd TTS - $ git remote add upstream https://github.com/coqui-ai/TTS.git + $ git clone git@github.com:/coqui-ai-TTS.git + $ cd coqui-ai-TTS + $ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git ``` 3. Build the Docker Image as your development environment (it installs all of the dependencies for you): diff --git a/Dockerfile b/Dockerfile index 9fb3005ef4..05c37d78fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,7 @@ FROM ${BASE} RUN apt-get update && apt-get upgrade -y RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* +RUN pip3 install -U pip setuptools RUN pip3 install llvmlite --ignore-installed # Install Dependencies: diff --git a/LICENSE.txt b/LICENSE.txt index 14e2f777f6..a612ad9813 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -35,7 +35,7 @@ Mozilla Public License Version 2.0 means any form of the work other than Source Code Form. 1.7. "Larger Work" - means a work that combines Covered Software with other material, in + means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. 
"License" diff --git a/MANIFEST.in b/MANIFEST.in index 321d3999c1..8d092ceff2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,6 @@ include README.md include LICENSE.txt -include requirements.*.txt include *.cff -include requirements.txt -include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html recursive-include TTS *.png @@ -11,5 +8,3 @@ recursive-include TTS *.md recursive-include TTS *.py recursive-include TTS *.pyx recursive-include images *.png -recursive-exclude tests * -prune tests* diff --git a/Makefile b/Makefile index 7446848f46..077b4b23e5 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ .DEFAULT_GOAL := help -.PHONY: test system-deps dev-deps deps style lint install help docs +.PHONY: test system-deps dev-deps style lint install install_dev help docs help: @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' @@ -11,47 +11,50 @@ test_all: ## run tests and don't stop on an error. ./run_bash_tests.sh test: ## run tests. - nose2 -F -v -B --with-coverage --coverage TTS tests + coverage run -m nose2 -F -v -B tests test_vocoder: ## run vocoder tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests + coverage run -m nose2 -F -v -B tests.vocoder_tests test_tts: ## run tts tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests + coverage run -m nose2 -F -v -B tests.tts_tests test_tts2: ## run tts tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2 + coverage run -m nose2 -F -v -B tests.tts_tests2 test_xtts: - nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests + coverage run -m nose2 -F -v -B tests.xtts_tests test_aux: ## run aux tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests + coverage run -m nose2 -F -v -B tests.aux_tests ./run_bash_tests.sh -test_zoo: ## run zoo tests. 
- nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests +test_zoo0: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ + tests.zoo_tests.test_models.test_voice_conversion +test_zoo1: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 +test_zoo2: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 inference_tests: ## run inference tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests + coverage run -m nose2 -F -v -B tests.inference_tests data_tests: ## run data tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests + coverage run -m nose2 -F -v -B tests.data_tests test_text: ## run text tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests + coverage run -m nose2 -F -v -B tests.text_tests test_failed: ## only run tests failed the last time. - nose2 -F -v -B --with-coverage --coverage TTS tests + coverage run -m nose2 -F -v -B tests style: ## update code style. black ${target_dirs} - isort ${target_dirs} -lint: ## run pylint linter. - pylint ${target_dirs} +lint: ## run linters. + ruff check ${target_dirs} black ${target_dirs} --check - isort ${target_dirs} --check-only system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev @@ -59,20 +62,15 @@ system-deps: ## install linux system deps dev-deps: ## install development deps pip install -r requirements.dev.txt -doc-deps: ## install docs dependencies - pip install -r docs/requirements.txt - build-docs: ## build the docs cd docs && make clean && make build -hub-deps: ## install deps for torch hub use - pip install -r requirements.hub.txt - -deps: ## install 🐸 requirements. - pip install -r requirements.txt - -install: ## install 🐸 TTS for development. +install: ## install 🐸 TTS pip install -e .[all] +install_dev: ## install 🐸 TTS for development. 
+ pip install -e .[all,dev] + pre-commit install + docs: ## build the docs $(MAKE) -C docs clean && $(MAKE) -C docs html diff --git a/README.md b/README.md index 891118c13d..c6a1db4fff 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,18 @@ -## 🐸Coqui.ai News +## 🐸Coqui TTS News +- đŸ“Ŗ Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) - đŸ“Ŗ ⓍTTSv2 is here with 16 languages and better performance across the board. -- đŸ“Ŗ ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech). +- đŸ“Ŗ ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). - đŸ“Ŗ ⓍTTS can now stream with <200ms latency. -- đŸ“Ŗ ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html) -- đŸ“Ŗ [đŸļBark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html) +- đŸ“Ŗ ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html) +- đŸ“Ŗ [đŸļBark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) - đŸ“Ŗ You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -- đŸ“Ŗ 🐸TTS now supports đŸĸTortoise with faster inference. 
[Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html) -- đŸ“Ŗ Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice) -- đŸ“Ŗ Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). -- đŸ“Ŗ Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). +- đŸ“Ŗ 🐸TTS now supports đŸĸTortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/tortoise.html)
-## +## **🐸TTS is a library for advanced Text-to-Speech generation.** @@ -28,23 +26,15 @@ ______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) [![License]()](https://opensource.org/licenses/MPL-2.0) -[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) -[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) -[![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts) +[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://badge.fury.io/py/coqui-tts) +[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md) +[![Downloads](https://pepy.tech/badge/coqui-tts)](https://pepy.tech/project/coqui-tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg) 
-![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg) -[![Docs]()](https://tts.readthedocs.io/en/latest/) +![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg) +![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg) +![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg) +[![Docs]()](https://coqui-tts.readthedocs.io/en/latest/)
@@ -60,28 +50,26 @@ Please use our dedicated channels for questions and discussion. Help is much mor | 👩‍đŸ’ģ **Usage Questions** | [GitHub Discussions] | | đŸ—¯ **General Discussion** | [GitHub Discussions] or [Discord] | -[github issue tracker]: https://github.com/coqui-ai/tts/issues -[github discussions]: https://github.com/coqui-ai/TTS/discussions +[github issue tracker]: https://github.com/idiap/coqui-ai-TTS/issues +[github discussions]: https://github.com/idiap/coqui-ai-TTS/discussions [discord]: https://discord.gg/5eXr5seRrv [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials +The [issues](https://github.com/coqui-ai/TTS/issues) and +[discussions](https://github.com/coqui-ai/TTS/discussions) in the original +repository are also still a useful source of information. + ## 🔗 Links and Resources | Type | Links | | ------------------------------- | --------------------------------------- | -| đŸ’ŧ **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) -| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)| -| 👩‍đŸ’ģ **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)| +| đŸ’ŧ **Documentation** | [ReadTheDocs](https://coqui-tts.readthedocs.io/en/latest/) +| 💾 **Installation** | [TTS/README.md](https://github.com/idiap/coqui-ai-TTS/tree/dev#installation)| +| 👩‍đŸ’ģ **Contributing** | [CONTRIBUTING.md](https://github.com/idiap/coqui-ai-TTS/blob/main/CONTRIBUTING.md)| | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) -| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)| +| 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 
languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)| | 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)| - -## đŸĨ‡ TTS Performance -

- -Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices. - ## Features - High-performance Deep Learning models for Text2Speech tasks. - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech). @@ -147,21 +135,48 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea You can also help us implement more models. ## Installation -🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**. +🐸TTS is tested on Ubuntu 22.04 with **python >= 3.9, < 3.13.**. -If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. +If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. ```bash -pip install TTS +pip install coqui-tts ``` If you plan to code or train models, clone 🐸TTS and install it locally. ```bash -git clone https://github.com/coqui-ai/TTS -pip install -e .[all,dev,notebooks] # Select the relevant extras +git clone https://github.com/idiap/coqui-ai-TTS +cd coqui-ai-TTS +pip install -e . 
``` +### Optional dependencies + +The following extras allow the installation of optional dependencies: + +| Name | Description | +|------|-------------| +| `all` | All optional dependencies, except `dev` and `docs` | +| `dev` | Development dependencies | +| `docs` | Dependencies for building the documentation | +| `notebooks` | Dependencies only used in notebooks | +| `server` | Dependencies to run the TTS server | +| `bn` | Bangla G2P | +| `ja` | Japanese G2P | +| `ko` | Korean G2P | +| `zh` | Chinese G2P | +| `languages` | All language-specific dependencies | + +You can install extras with one of the following commands: + +```bash +pip install coqui-tts[server,ja] +pip install -e .[server,ja] +``` + +### Platforms + If you are on Ubuntu (Debian), you can also run following commands for installation. ```bash @@ -169,7 +184,9 @@ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you $ make install ``` -If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system). +If you are on Windows, 👑@GuyPaddock wrote installation instructions +[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system) +(note that these are out of date, e.g. you need to have at least Python 3.9). 
## Docker Image @@ -183,7 +200,8 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a s ``` You can then enjoy the TTS server [here](http://[::1]:5002/) -More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html) +More details about the docker images (like GPU support) can be found +[here](https://coqui-tts.readthedocs.io/en/latest/docker_images.html) ## Synthesizing speech by 🐸TTS @@ -257,11 +275,10 @@ You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tt and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). ```python -# TTS with on the fly voice conversion +# TTS with fairseq models api = TTS("tts_models/deu/fairseq/vits") -api.tts_with_vc_to_file( +api.tts_to_file( "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", file_path="output.wav" ) ``` diff --git a/TTS/.models.json b/TTS/.models.json index b349e7397b..a77ebea1cf 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -46,7 +46,7 @@ "hf_url": [ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", - "https://coqui.gateway.scarf.sh/hf/text_2.pt", + "https://coqui.gateway.scarf.sh/hf/bark/text_2.pt", "https://coqui.gateway.scarf.sh/hf/bark/config.json", "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" diff --git a/TTS/VERSION b/TTS/VERSION deleted file mode 100644 index 2157409059..0000000000 --- a/TTS/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.22.0 diff --git a/TTS/__init__.py b/TTS/__init__.py index eaf05db1b9..9e87bca4be 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1,6 +1,3 @@ -import os +import importlib.metadata -with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f: - version = f.read().strip() - -__version__ = version 
+__version__ = importlib.metadata.version("coqui-tts") diff --git a/TTS/api.py b/TTS/api.py index 7abc188e74..250ed1a0d9 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -1,15 +1,16 @@ +import logging import tempfile import warnings from pathlib import Path -from typing import Union -import numpy as np from torch import nn +from TTS.config import load_config from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer -from TTS.config import load_config + +logger = logging.getLogger(__name__) class TTS(nn.Module): @@ -61,7 +62,7 @@ def __init__( gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ super().__init__() - self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) + self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None @@ -99,7 +100,7 @@ def is_multi_lingual(self): isinstance(self.model_name, str) and "xtts" in self.model_name or self.config - and ("xtts" in self.config.model or len(self.config.languages) > 1) + and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1) ): return True if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: @@ -122,8 +123,9 @@ def languages(self): def get_models_file_path(): return Path(__file__).parent / ".models.json" - def list_models(self): - return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) + @staticmethod + def list_models(): + return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() def download_model_by_name(self, model_name: str): model_path, config_path, model_item = self.manager.download_model(model_name) @@ 
-168,9 +170,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name - ) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) # init synthesizer # None values are fetch from the model @@ -231,7 +231,7 @@ def _check_arguments( raise ValueError("Model is not multi-speaker but `speaker` is provided.") if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but `language` is provided.") - if not emotion is None and not speed is None: + if emotion is not None and speed is not None: raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.") def tts( diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py index 662fcd02ec..32aa303e6e 100644 --- a/TTS/bin/collect_env_info.py +++ b/TTS/bin/collect_env_info.py @@ -1,4 +1,6 @@ """Get detailed info about the working environment.""" + +import json import os import platform import sys @@ -6,11 +8,10 @@ import numpy import torch -sys.path += [os.path.abspath(".."), os.path.abspath(".")] -import json - import TTS +sys.path += [os.path.abspath(".."), os.path.abspath(".")] + def system_info(): return { diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 9ab520be7d..127199186b 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -1,5 +1,6 @@ import argparse import importlib +import logging import os from argparse import RawTextHelpFormatter @@ -7,15 +8,18 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm +from trainer.io import load_checkpoint from TTS.config import load_config from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model from 
TTS.tts.utils.text.characters import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_checkpoint +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + # pylint: disable=bad-option-value parser = argparse.ArgumentParser( description="""Extract attention masks from trained Tacotron/Tacotron2 models. @@ -31,7 +35,7 @@ --data_path /root/LJSpeech-1.1/ --batch_size 32 --dataset ljspeech - --use_cuda True + --use_cuda """, formatter_class=RawTextHelpFormatter, ) @@ -58,7 +62,7 @@ help="Dataset metafile inclusing file paths with transcripts.", ) parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.") - parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.") + parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.") parser.add_argument( "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA." 
@@ -70,7 +74,7 @@ # if the vocabulary was passed, replace the default if "characters" in C.keys(): - symbols, phonemes = make_symbols(**C.characters) + symbols, phonemes = make_symbols(**C.characters) # noqa: F811 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 5b5a37df73..1bdb8d733c 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -1,4 +1,5 @@ import argparse +import logging import os from argparse import RawTextHelpFormatter @@ -10,6 +11,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.managers import save_file from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger def compute_embeddings( @@ -100,6 +102,8 @@ def compute_embeddings( if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser( description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" """ @@ -146,7 +150,7 @@ def compute_embeddings( default=False, action="store_true", ) - parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) parser.add_argument("--no_eval", help="Do not compute eval?. 
Default False", default=False, action="store_true") parser.add_argument( "--formatter_name", diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 3ab7ea7a3b..dc5423a691 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -3,6 +3,7 @@ import argparse import glob +import logging import os import numpy as np @@ -12,10 +13,13 @@ from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger def main(): """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 60fed13932..711c8221db 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,4 +1,5 @@ import argparse +import logging from argparse import RawTextHelpFormatter import torch @@ -7,6 +8,7 @@ from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger def compute_encoder_accuracy(dataset_items, encoder_manager): @@ -51,6 +53,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser( description="""Compute the accuracy of the encoder.\n\n""" """ @@ -71,8 +75,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): type=str, help="Path to dataset config file.", ) - 
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) - parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True) + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) args = parser.parse_args() diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index c6048626b3..86a4dce177 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -2,12 +2,14 @@ """Extract Mel spectrograms with teacher forcing.""" import argparse +import logging import os import numpy as np import torch from torch.utils.data import DataLoader from tqdm import tqdm +from trainer.generic_utils import count_parameters from TTS.config import load_config from TTS.tts.datasets import TTSDataset, load_tts_samples @@ -16,12 +18,12 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import quantize -from TTS.utils.generic_utils import count_parameters +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger use_cuda = torch.cuda.is_available() -def setup_loader(ap, r, verbose=False): +def setup_loader(ap, r): tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, @@ -37,7 +39,6 @@ def setup_loader(ap, r, verbose=False): phoneme_cache_path=c.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - verbose=verbose, speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, ) @@ -257,7 +258,7 @@ def main(args): # pylint: disable=redefined-outer-name print("\n > Model has {} parameters".format(num_params), flush=True) # set r r = 1 if c.model.lower() == "glow_tts" else 
model.decoder.r - own_loader = setup_loader(ap, r, verbose=True) + own_loader = setup_loader(ap, r) extract_spectrograms( own_loader, @@ -272,6 +273,8 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser() parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) @@ -279,7 +282,7 @@ def main(args): # pylint: disable=redefined-outer-name parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) args = parser.parse_args() c = load_config(args.config_path) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index ea16974839..0519d43769 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -1,12 +1,17 @@ """Find all the unique characters in a dataset""" + import argparse +import logging from argparse import RawTextHelpFormatter from TTS.config import load_config -from TTS.tts.datasets import load_tts_samples +from TTS.tts.datasets import find_unique_chars, load_tts_samples +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger def main(): + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + # pylint: disable=bad-option-value parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" @@ -28,17 +33,7 @@ def 
main(): ) items = train_items + eval_items - - texts = "".join(item["text"] for item in items) - chars = set(texts) - lower_chars = filter(lambda c: c.islower(), chars) - chars_force_lower = [c.lower() for c in chars] - chars_force_lower = set(chars_force_lower) - - print(f" > Number of unique characters: {len(chars)}") - print(f" > Unique characters: {''.join(sorted(chars))}") - print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") - print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + find_unique_chars(items) if __name__ == "__main__": diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 4bd7a78eef..d99acb9893 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,5 +1,7 @@ """Find all the unique characters in a dataset""" + import argparse +import logging import multiprocessing from argparse import RawTextHelpFormatter @@ -8,15 +10,18 @@ from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.text.phonemizers import Gruut +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger def compute_phonemes(item): text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") - return set(list(ph)) + return set(ph) def main(): + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + # pylint: disable=W0601 global c, phonemizer # pylint: disable=bad-option-value diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index a1eaf4c9a7..edab882db8 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,5 +1,6 @@ import argparse import glob +import logging import multiprocessing import os import pathlib @@ -7,6 +8,7 @@ import torch from tqdm import tqdm +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.vad import get_vad_model_and_utils, remove_silence 
torch.set_num_threads(1) @@ -75,8 +77,10 @@ def preprocess_audios(): if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser( - description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True" + description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end" ) parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True) parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="") @@ -91,20 +95,20 @@ def preprocess_audios(): parser.add_argument( "-t", "--trim_just_beginning_and_end", - type=bool, + action=argparse.BooleanOptionalAction, default=True, - help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True", + help="If True this script will trim just the beginning and end nonspeech parts. 
If False all nonspeech parts will be trimmed.", ) parser.add_argument( "-c", "--use_cuda", - type=bool, + action=argparse.BooleanOptionalAction, default=False, help="If True use cuda", ) parser.add_argument( "--use_onnx", - type=bool, + action=argparse.BooleanOptionalAction, default=False, help="If True use onnx", ) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index b86252ab67..bc01ffd595 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -1,14 +1,20 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- + +"""Command line interface.""" import argparse import contextlib +import logging import sys from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument from pathlib import Path +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger + +logger = logging.getLogger(__name__) + description = """ Synthesize speech on command line. @@ -131,17 +137,8 @@ """ -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - if v.lower() in ("no", "false", "f", "n", "0"): - return False - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def main(): +def parse_args() -> argparse.Namespace: + """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), formatter_class=RawTextHelpFormatter, @@ -149,10 +146,7 @@ def main(): parser.add_argument( "--list_models", - type=str2bool, - nargs="?", - const=True, - default=False, + action="store_true", help="list available pre-trained TTS and vocoder models.", ) @@ -200,7 +194,7 @@ def main(): default="tts_output.wav", help="Output wav file path.", ) - parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False) + parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.") parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument( "--vocoder_path", @@ 
-219,12 +213,9 @@ def main(): parser.add_argument( "--pipe_out", help="stdout the generated TTS wav file for shell pipe.", - type=str2bool, - nargs="?", - const=True, - default=False, + action="store_true", ) - + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -254,25 +245,18 @@ def main(): parser.add_argument( "--list_speaker_idxs", help="List available speaker ids for the defined multi-speaker model.", - type=str2bool, - nargs="?", - const=True, - default=False, + action="store_true", ) parser.add_argument( "--list_language_idxs", help="List available language ids for the defined multi-lingual model.", - type=str2bool, - nargs="?", - const=True, - default=False, + action="store_true", ) # aux args parser.add_argument( "--save_spectogram", - type=bool, - help="If true save raw spectogram for further (vocoder) processing in out_path.", - default=False, + action="store_true", + help="Save raw spectogram for further (vocoder) processing in out_path.", ) parser.add_argument( "--reference_wav", @@ -288,8 +272,8 @@ def main(): ) parser.add_argument( "--progress_bar", - type=str2bool, - help="If true shows a progress bar for the model download. 
Defaults to True", + action=argparse.BooleanOptionalAction, + help="Show a progress bar for the model download.", default=True, ) @@ -330,19 +314,23 @@ def main(): ] if not any(check_args): parser.parse_args(["-h"]) + return args + + +def main(): + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + args = parse_args() pipe_out = sys.stdout if args.pipe_out else None with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): # Late-import to make things load faster - from TTS.api import TTS from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer # load model manager path = Path(__file__).parent / "../.models.json" manager = ModelManager(path, progress_bar=args.progress_bar) - api = TTS() tts_path = None tts_config_path = None @@ -379,10 +367,8 @@ def main(): if model_item["model_type"] == "tts_models": tts_path = model_path tts_config_path = config_path - if "default_vocoder" in model_item: - args.vocoder_name = ( - model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - ) + if args.vocoder_name is None and "default_vocoder" in model_item: + args.vocoder_name = model_item["default_vocoder"] # voice conversion model if model_item["model_type"] == "voice_conversion_models": @@ -437,31 +423,37 @@ def main(): # query speaker ids of a multi-speaker model. if args.list_speaker_idxs: - print( - " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + if synthesizer.tts_model.speaker_manager is None: + logger.info("Model only has a single speaker.") + return + logger.info( + "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.tts_model.speaker_manager.name_to_id) + logger.info(synthesizer.tts_model.speaker_manager.name_to_id) return # query langauge ids of a multi-lingual model. 
if args.list_language_idxs: - print( - " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + if synthesizer.tts_model.language_manager is None: + logger.info("Monolingual model.") + return + logger.info( + "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - print(synthesizer.tts_model.language_manager.name_to_id) + logger.info(synthesizer.tts_model.language_manager.name_to_id) return # check the arguments against a multi-speaker model. if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): - print( - " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " + logger.error( + "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." ) return # RUN THE SYNTHESIS if args.text: - print(" > Text: {}".format(args.text)) + logger.info("Text: %s", args.text) # kick it if tts_path is not None: @@ -486,8 +478,8 @@ def main(): ) # save the results - print(" > Saving output to {}".format(args.out_path)) synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) + logger.info("Saved output to %s", args.out_path) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 448fefc712..ba03c42b6d 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,13 +1,16 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import logging import os import sys import time import traceback +import warnings import torch from torch.utils.data import DataLoader +from trainer.generic_utils import count_parameters, remove_experiment_folder from trainer.io import copy_model_files, save_best_model, save_checkpoint from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer @@ -18,7 +21,7 @@ from TTS.encoder.utils.visual import plot_embeddings from 
TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import count_parameters, remove_experiment_folder +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update @@ -31,7 +34,7 @@ print(" > Number of GPUs: ", num_gpus) -def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False): +def setup_loader(ap: AudioProcessor, is_val: bool = False): num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch @@ -42,7 +45,6 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False voice_len=c.voice_len, num_utter_per_class=num_utter_per_class, num_classes_in_batch=num_classes_in_batch, - verbose=verbose, augmentation_config=c.audio_augmentation if not is_val else None, use_torch_spec=c.model_params.get("use_torch_spec", False), ) @@ -115,17 +117,20 @@ def evaluation(model, criterion, data_loader, global_step): eval_avg_loss = eval_loss / len(data_loader) # save stats dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) - # plot the last batch in the evaluation - figures = { - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), - } - dashboard_logger.eval_figures(global_step, figures) + try: + # plot the last batch in the evaluation + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.eval_figures(global_step, figures) + except ImportError: + warnings.warn("Install the `umap-learn` package to see embedding plots.") return eval_avg_loss def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() - best_loss = float("inf") + best_loss = {"train_loss": None, "eval_loss": float("inf")} 
avg_loader_time = 0 end_time = time.time() for epoch in range(c.epochs): @@ -160,9 +165,6 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, loader_time = time.time() - end_time global_step += 1 - # setup lr - if c.lr_decay: - scheduler.step() optimizer.zero_grad() # dispatch data to GPU @@ -181,6 +183,10 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() + # setup lr + if c.lr_decay: + scheduler.step() + step_time = time.time() - start_time epoch_time += step_time @@ -248,7 +254,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, ) # save the best checkpoint best_loss = save_best_model( - eval_loss, + {"train_loss": None, "eval_loss": eval_loss}, best_loss, c, model, @@ -278,9 +284,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=redefined-outer-name meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False) if c.run_eval: - eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True) + eval_data_loader, _, _ = setup_loader(ap, is_val=True) else: eval_data_loader = None @@ -316,6 +322,8 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training() try: diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index bdb4f6f691..6d6342a762 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,3 +1,4 @@ +import logging import os from dataclasses import dataclass, field @@ -6,6 +7,7 @@ from TTS.config import load_config, register_config from 
TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger @dataclass @@ -15,6 +17,8 @@ class TrainTTSArgs(TrainerArgs): def main(): """Run `tts` model training directly by a `config.json` file.""" + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + # init trainer args train_args = TrainTTSArgs() parser = train_args.init_argparse(arg_prefix="") diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 32ecd7bdc3..221ff4cff0 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,3 +1,4 @@ +import logging import os from dataclasses import dataclass, field @@ -5,6 +6,7 @@ from TTS.config import load_config, register_config from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.models import setup_model @@ -16,6 +18,8 @@ class TrainVocoderArgs(TrainerArgs): def main(): """Run `tts` model training directly by a `config.json` file.""" + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + # init trainer args train_args = TrainVocoderArgs() parser = train_args.init_argparse(arg_prefix="") diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index 09582cea7c..df2923952d 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -1,5 +1,7 @@ """Search a good noise schedule for WaveGrad for a given number of inference iterations""" + import argparse +import logging from itertools import product as cartesian_product import numpy as np @@ -9,11 +11,14 @@ from TTS.config import load_config from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset 
from TTS.vocoder.models import setup_model if __name__ == "__main__": + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") parser.add_argument("--config_path", type=str, help="Path to model config file.") @@ -54,7 +59,6 @@ return_segments=False, use_noise_augment=False, use_cache=False, - verbose=True, ) loader = DataLoader( dataset, diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index c5a6dd68e2..5103f200b0 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -17,9 +17,12 @@ def read_json_with_comments(json_path): with fsspec.open(json_path, "r", encoding="utf-8") as f: input_str = f.read() # handle comments but not urls with // - input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str) + input_str = re.sub( + r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str + ) return json.loads(input_str) + def register_config(model_name: str) -> Coqpit: """Find the right config for the given model name. 
diff --git a/TTS/demos/xtts_ft_demo/requirements.txt b/TTS/demos/xtts_ft_demo/requirements.txt index cb5b16f66e..b58f41c546 100644 --- a/TTS/demos/xtts_ft_demo/requirements.txt +++ b/TTS/demos/xtts_ft_demo/requirements.txt @@ -1,2 +1,2 @@ faster_whisper==0.9.0 -gradio==4.7.1 \ No newline at end of file +gradio==4.7.1 diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py index 536faa0108..40e8b8ed32 100644 --- a/TTS/demos/xtts_ft_demo/utils/formatter.py +++ b/TTS/demos/xtts_ft_demo/utils/formatter.py @@ -1,23 +1,17 @@ -import os import gc -import torchaudio +import os + import pandas +import torch +import torchaudio from faster_whisper import WhisperModel -from glob import glob - from tqdm import tqdm -import torch -import torchaudio # torch.set_num_threads(1) - from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners torch.set_num_threads(16) - -import os - audio_types = (".wav", ".mp3", ".flac") @@ -25,9 +19,10 @@ def list_audios(basePath, contains=None): # return the set of files that are valid return list_files(basePath, validExts=audio_types, contains=contains) + def list_files(basePath, validExts=None, contains=None): # loop over the directory structure - for (rootDir, dirNames, filenames) in os.walk(basePath): + for rootDir, dirNames, filenames in os.walk(basePath): # loop over the filenames in the current directory for filename in filenames: # if the contains string is not none and the filename does not contain @@ -36,7 +31,7 @@ def list_files(basePath, validExts=None, contains=None): continue # determine the file extension of the current file - ext = filename[filename.rfind("."):].lower() + ext = filename[filename.rfind(".") :].lower() # check to see if the file is an audio and should be processed if validExts is None or ext.endswith(validExts): @@ -44,13 +39,22 @@ def list_files(basePath, validExts=None, contains=None): audioPath = os.path.join(rootDir, filename) yield audioPath -def 
format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None): + +def format_audio_list( + audio_files, + target_language="en", + out_path=None, + buffer=0.2, + eval_percentage=0.15, + speaker_name="coqui", + gradio_progress=None, +): audio_total_size = 0 # make sure that ooutput file exists os.makedirs(out_path, exist_ok=True) # Loading Whisper - device = "cuda" if torch.cuda.is_available() else "cpu" + device = "cuda" if torch.cuda.is_available() else "cpu" print("Loading Whisper Model!") asr_model = WhisperModel("large-v2", device=device, compute_type="float16") @@ -69,7 +73,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 wav = torch.mean(wav, dim=0, keepdim=True) wav = wav.squeeze() - audio_total_size += (wav.size(-1) / sr) + audio_total_size += wav.size(-1) / sr segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language) segments = list(segments) @@ -94,7 +98,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 # get previous sentence end previous_word_end = words_list[word_idx - 1].end # add buffer or get the silence midle between the previous sentence and the current one - sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2) + sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2) sentence = word.word first_word = False @@ -118,19 +122,16 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 # Average the current word end and next word start word_end = min((word.end + next_word_start) / 2, word.end + buffer) - + absoulte_path = os.path.join(out_path, audio_file) os.makedirs(os.path.dirname(absoulte_path), exist_ok=True) i += 1 first_word = True - audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0) + audio = wav[int(sr * sentence_start) : int(sr * 
word_end)].unsqueeze(0) # if the audio is too short ignore it (i.e < 0.33 seconds) - if audio.size(-1) >= sr/3: - torchaudio.save(absoulte_path, - audio, - sr - ) + if audio.size(-1) >= sr / 3: + torchaudio.save(absoulte_path, audio, sr) else: continue @@ -140,21 +141,21 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 df = pandas.DataFrame(metadata) df = df.sample(frac=1) - num_val_samples = int(len(df)*eval_percentage) + num_val_samples = int(len(df) * eval_percentage) df_eval = df[:num_val_samples] df_train = df[num_val_samples:] - df_train = df_train.sort_values('audio_file') + df_train = df_train.sort_values("audio_file") train_metadata_path = os.path.join(out_path, "metadata_train.csv") df_train.to_csv(train_metadata_path, sep="|", index=False) eval_metadata_path = os.path.join(out_path, "metadata_eval.csv") - df_eval = df_eval.sort_values('audio_file') + df_eval = df_eval.sort_values("audio_file") df_eval.to_csv(eval_metadata_path, sep="|", index=False) # deallocate VRAM and RAM del asr_model, df_train, df_eval, df, metadata gc.collect() - return train_metadata_path, eval_metadata_path, audio_total_size \ No newline at end of file + return train_metadata_path, eval_metadata_path, audio_total_size diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py index a98765c3e7..7b41966b8f 100644 --- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -1,5 +1,5 @@ -import os import gc +import os from trainer import Trainer, TrainerArgs @@ -25,7 +25,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, BATCH_SIZE = batch_size # set here the batch size GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps - # Define here the dataset that you want to use for the fine-tuning on. 
config_dataset = BaseDatasetConfig( formatter="coqui", @@ -43,7 +42,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/") os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True) - # DVAE files DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth" MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth" @@ -55,8 +53,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, # download DVAE files if needed if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE): print(" > Downloading DVAE files!") - ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True) - + ModelManager._download_model_files( + [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True + ) # Download XTTS v2.0 checkpoint if needed TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json" @@ -160,7 +159,7 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, # get the longest text audio file to use as speaker reference samples_len = [len(item["text"].split(" ")) for item in train_samples] - longest_text_idx = samples_len.index(max(samples_len)) + longest_text_idx = samples_len.index(max(samples_len)) speaker_ref = train_samples[longest_text_idx]["audio_file"] trainer_out_path = trainer.output_path diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index ebb11f29d1..7ac38ed6ee 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -1,19 +1,16 @@ import argparse +import logging import os import sys import tempfile +import traceback import gradio as gr -import librosa.display -import numpy as np - -import os import torch import torchaudio -import traceback + from 
TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt - from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import Xtts @@ -23,7 +20,10 @@ def clear_gpu_cache(): if torch.cuda.is_available(): torch.cuda.empty_cache() + XTTS_MODEL = None + + def load_model(xtts_checkpoint, xtts_config, xtts_vocab): global XTTS_MODEL clear_gpu_cache() @@ -40,17 +40,23 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab): print("Model Loaded!") return "Model Loaded!" + def run_tts(lang, tts_text, speaker_audio_file): if XTTS_MODEL is None or not speaker_audio_file: return "You need to run the previous step to load the model !!", None, None - gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs) + gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents( + audio_path=speaker_audio_file, + gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, + max_ref_length=XTTS_MODEL.config.max_ref_len, + sound_norm_refs=XTTS_MODEL.config.sound_norm_refs, + ) out = XTTS_MODEL.inference( text=tts_text, language=lang, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding, - temperature=XTTS_MODEL.config.temperature, # Add custom parameters here + temperature=XTTS_MODEL.config.temperature, # Add custom parameters here length_penalty=XTTS_MODEL.config.length_penalty, repetition_penalty=XTTS_MODEL.config.repetition_penalty, top_k=XTTS_MODEL.config.top_k, @@ -65,9 +71,7 @@ def run_tts(lang, tts_text, speaker_audio_file): return "Speech generated !", out_path, speaker_audio_file - - -# define a logger to redirect +# define a logger to redirect class Logger: def __init__(self, filename="log.out"): self.log_file = filename @@ -85,21 +89,19 @@ def flush(self): def isatty(self): return False + # 
redirect stdout and stderr to a file sys.stdout = Logger() sys.stderr = sys.stdout # logging.basicConfig(stream=sys.stdout, level=logging.INFO) -import logging + logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.StreamHandler(sys.stdout) - ] + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)] ) + def read_logs(): sys.stdout.flush() with open(sys.stdout.log_file, "r") as f: @@ -107,12 +109,11 @@ def read_logs(): if __name__ == "__main__": - parser = argparse.ArgumentParser( description="""XTTS fine-tuning demo\n\n""" """ Example runs: - python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port + python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port """, formatter_class=argparse.RawTextHelpFormatter, ) @@ -190,12 +191,11 @@ def read_logs(): "zh", "hu", "ko", - "ja" + "ja", + "hi", ], ) - progress_data = gr.Label( - label="Progress:" - ) + progress_data = gr.Label(label="Progress:") logs = gr.Textbox( label="Logs:", interactive=False, @@ -203,20 +203,30 @@ def read_logs(): demo.load(read_logs, None, logs, every=1) prompt_compute_btn = gr.Button(value="Step 1 - Create dataset") - + def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)): clear_gpu_cache() out_path = os.path.join(out_path, "dataset") os.makedirs(out_path, exist_ok=True) if audio_path is None: - return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", "" + return ( + "You should provide one or multiple audio files! 
If you provided it, probably the upload of the files is not finished yet!", + "", + "", + ) else: try: - train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress) + train_meta, eval_meta, audio_total_size = format_audio_list( + audio_path, target_language=language, out_path=out_path, gradio_progress=progress + ) except: traceback.print_exc() error = traceback.format_exc() - return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", "" + return ( + f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", + "", + "", + ) clear_gpu_cache() @@ -236,7 +246,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac eval_csv = gr.Textbox( label="Eval CSV:", ) - num_epochs = gr.Slider( + num_epochs = gr.Slider( label="Number of epochs:", minimum=1, maximum=100, @@ -264,9 +274,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac step=1, value=args.max_audio_length, ) - progress_train = gr.Label( - label="Progress:" - ) + progress_train = gr.Label(label="Progress:") logs_tts_train = gr.Textbox( label="Logs:", interactive=False, @@ -274,18 +282,41 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac demo.load(read_logs, None, logs_tts_train, every=1) train_btn = gr.Button(value="Step 2 - Run the training") - def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length): + def train_model( + language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length + ): clear_gpu_cache() if not train_csv or not eval_csv: - return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", "" + return ( + "You 
need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", + "", + "", + "", + "", + ) try: # convert seconds to waveform frames max_audio_length = int(max_audio_length * 22050) - config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length) + config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt( + language, + num_epochs, + batch_size, + grad_acumm, + train_csv, + eval_csv, + output_path=output_path, + max_audio_length=max_audio_length, + ) except: traceback.print_exc() error = traceback.format_exc() - return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", "" + return ( + f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", + "", + "", + "", + "", + ) # copy original files to avoid parameters changes issues os.system(f"cp {config_path} {exp_path}") @@ -312,9 +343,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum label="XTTS vocab path:", value="", ) - progress_load = gr.Label( - label="Progress:" - ) + progress_load = gr.Label(label="Progress:") load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model") with gr.Column() as col2: @@ -342,7 +371,8 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum "hu", "ko", "ja", - ] + "hi", + ], ) tts_text = gr.Textbox( label="Input Text.", @@ -351,9 +381,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum tts_btn = gr.Button(value="Step 4 - Inference") with gr.Column() as col3: - progress_gen = gr.Label( - label="Progress:" - ) + progress_gen = gr.Label(label="Progress:") tts_output_audio = gr.Audio(label="Generated Audio.") 
reference_audio = gr.Audio(label="Reference audio used.") @@ -371,7 +399,6 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum ], ) - train_btn.click( fn=train_model, inputs=[ @@ -386,14 +413,10 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum ], outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio], ) - + load_btn.click( fn=load_model, - inputs=[ - xtts_checkpoint, - xtts_config, - xtts_vocab - ], + inputs=[xtts_checkpoint, xtts_config, xtts_vocab], outputs=[progress_load], ) @@ -407,9 +430,4 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum outputs=[progress_gen, tts_output_audio, reference_audio], ) - demo.launch( - share=True, - debug=False, - server_port=args.port, - server_name="0.0.0.0" - ) + demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0") diff --git a/TTS/encoder/README.md b/TTS/encoder/README.md index b38b20052b..9f829c9e2a 100644 --- a/TTS/encoder/README.md +++ b/TTS/encoder/README.md @@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` -- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . 
This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. - Watch training on Tensorboard as in TTS diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py index 5eda2671be..1d12325cf2 100644 --- a/TTS/encoder/configs/emotion_encoder_config.py +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -1,4 +1,4 @@ -from dataclasses import asdict, dataclass +from dataclasses import dataclass from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py index 6dceb00277..0588527a68 100644 --- a/TTS/encoder/configs/speaker_encoder_config.py +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -1,4 +1,4 @@ -from dataclasses import asdict, dataclass +from dataclasses import dataclass from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py index 582b1fe9ca..bb780e3c1d 100644 --- a/TTS/encoder/dataset.py +++ b/TTS/encoder/dataset.py @@ -1,3 +1,4 @@ +import logging import random import torch @@ -5,6 +6,8 @@ from TTS.encoder.utils.generic_utils import AugmentWAV +logger = logging.getLogger(__name__) + class EncoderDataset(Dataset): def __init__( @@ -15,7 +18,6 @@ def __init__( voice_len=1.6, num_classes_in_batch=64, num_utter_per_class=10, - verbose=False, augmentation_config=None, use_torch_spec=None, ): @@ -24,7 +26,6 @@ def __init__( ap (TTS.tts.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. seq_len (int): voice segment length in seconds. - verbose (bool): print diagnostic information. 
""" super().__init__() self.config = config @@ -33,7 +34,6 @@ def __init__( self.seq_len = int(voice_len * self.sample_rate) self.num_utter_per_class = num_utter_per_class self.ap = ap - self.verbose = verbose self.use_torch_spec = use_torch_spec self.classes, self.items = self.__parse_items() @@ -50,13 +50,12 @@ def __init__( if "gaussian" in augmentation_config.keys(): self.gaussian_augmentation_config = augmentation_config["gaussian"] - if self.verbose: - print("\n > DataLoader initialization") - print(f" | > Classes per Batch: {num_classes_in_batch}") - print(f" | > Number of instances : {len(self.items)}") - print(f" | > Sequence length: {self.seq_len}") - print(f" | > Num Classes: {len(self.classes)}") - print(f" | > Classes: {self.classes}") + logger.info("DataLoader initialization") + logger.info(" | Classes per batch: %d", num_classes_in_batch) + logger.info(" | Number of instances: %d", len(self.items)) + logger.info(" | Sequence length: %d", self.seq_len) + logger.info(" | Number of classes: %d", len(self.classes)) + logger.info(" | Classes: %s", self.classes) def load_wav(self, filename): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py index 5b5aa0fc48..2e27848c31 100644 --- a/TTS/encoder/losses.py +++ b/TTS/encoder/losses.py @@ -1,7 +1,11 @@ +import logging + import torch import torch.nn.functional as F from torch import nn +logger = logging.getLogger(__name__) + # adapted from https://github.com/cvqluu/GE2E-Loss class GE2ELoss(nn.Module): @@ -23,7 +27,7 @@ def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"): self.b = nn.Parameter(torch.tensor(init_b)) self.loss_method = loss_method - print(" > Initialized Generalized End-to-End loss") + logger.info("Initialized Generalized End-to-End loss") assert self.loss_method in ["softmax", "contrast"] @@ -139,7 +143,7 @@ def __init__(self, init_w=10.0, init_b=-5.0): self.b = nn.Parameter(torch.tensor(init_b)) self.criterion = 
torch.nn.CrossEntropyLoss() - print(" > Initialized Angular Prototypical loss") + logger.info("Initialized Angular Prototypical loss") def forward(self, x, _label=None): """ @@ -177,7 +181,7 @@ def __init__(self, embedding_dim, n_speakers): self.criterion = torch.nn.CrossEntropyLoss() self.fc = nn.Linear(embedding_dim, n_speakers) - print("Initialised Softmax Loss") + logger.info("Initialised Softmax Loss") def forward(self, x, label=None): # reshape for compatibility @@ -212,7 +216,7 @@ def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): self.softmax = SoftmaxLoss(embedding_dim, n_speakers) self.angleproto = AngleProtoLoss(init_w, init_b) - print("Initialised SoftmaxAnglePrototypical Loss") + logger.info("Initialised SoftmaxAnglePrototypical Loss") def forward(self, x, label=None): """ diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 957ea3c4ca..f7137c2186 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -1,12 +1,16 @@ +import logging + import numpy as np import torch import torchaudio from coqpit import Coqpit from torch import nn +from trainer.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.utils.generic_utils import set_init_dict -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) class PreEmphasis(nn.Module): @@ -118,13 +122,13 @@ def load_checkpoint( state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) try: self.load_state_dict(state["model"]) - print(" > Model fully restored. ") + logger.info("Model fully restored. 
") except (KeyError, RuntimeError) as error: # If eval raise the error if eval: raise error - print(" > Partial model initialization.") + logger.info("Partial model initialization.") model_dict = self.state_dict() model_dict = set_init_dict(model_dict, state["model"], c) self.load_state_dict(model_dict) @@ -135,7 +139,7 @@ def load_checkpoint( try: criterion.load_state_dict(state["criterion"]) except (KeyError, RuntimeError) as error: - print(" > Criterion load ignored because of:", error) + logger.exception("Criterion load ignored because of: %s", error) # instance and load the criterion for the encoder classifier in inference time if ( diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 236d6fe937..495b4def5a 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -1,4 +1,5 @@ import glob +import logging import os import random @@ -8,6 +9,8 @@ from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder +logger = logging.getLogger(__name__) + class AugmentWAV(object): def __init__(self, ap, augmentation_config): @@ -34,12 +37,14 @@ def __init__(self, ap, augmentation_config): # ignore not listed directories if noise_dir not in self.additive_noise_types: continue - if not noise_dir in self.noise_list: + if noise_dir not in self.noise_list: self.noise_list[noise_dir] = [] self.noise_list[noise_dir].append(wav_file) - print( - f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" + logger.info( + "Using Additive Noise Augmentation: with %d audios instances from %s", + len(additive_files), + self.additive_noise_types, ) self.use_rir = False @@ -50,7 +55,7 @@ def __init__(self, ap, augmentation_config): self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) self.use_rir = True - print(f" | > Using RIR Noise Augmentation: with 
{len(self.rir_files)} audios instances") + logger.info("Using RIR Noise Augmentation: with %d audios instances", len(self.rir_files)) self.create_augmentation_global_list() diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index b93baf9e60..da7522a512 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -19,15 +19,19 @@ # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes """ voxceleb 1 & 2 """ +import csv import hashlib +import logging import os import subprocess import sys import zipfile -import pandas import soundfile as sf -from absl import logging + +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger + +logger = logging.getLogger(__name__) SUBSETS = { "vox1_dev_wav": [ @@ -77,14 +81,14 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logging.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s" % (url, zip_filepath)) subprocess.call( "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) # concatenate all parts into zip files if ".zip" not in zip_filepath: @@ -118,9 +122,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logging.info(f"Child was terminated by signal {retcode}") + logger.info(f"Child was terminated by signal {retcode}") except OSError as e: - logging.info(f"Execution failed: {e}") + logger.info(f"Execution failed: {e}") retcode = -999 return retcode @@ -134,11 +138,11 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. 
""" cmd = f"ffmpeg -i {aac_file} {wav_file}" - logging.info(f"Decoding aac file using command line: {cmd}") + logger.info(f"Decoding aac file using command line: {cmd}") ret = exec_cmd(cmd) if ret != 0: - logging.error(f"Failed to decode aac file with retcode {ret}") - logging.error("Please check your ffmpeg installation.") + logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Please check your ffmpeg installation.") return False return True @@ -152,7 +156,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv """ - logging.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s" % subset) source_dir = os.path.join(input_dir, subset) files = [] @@ -185,9 +189,12 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): # Write to CSV file which contains four columns: # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name". 
csv_file_path = os.path.join(output_dir, output_file) - df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) - df.to_csv(csv_file_path, index=False, sep="\t") - logging.info("Successfully generated csv file {}".format(csv_file_path)) + with open(csv_file_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter="\t") + writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) + for wav_file in files: + writer.writerow(wav_file) + logger.info("Successfully generated csv file {}".format(csv_file_path)) def processor(directory, subset, force_process): @@ -200,16 +207,16 @@ def processor(directory, subset, force_process): if not force_process and os.path.exists(subset_csv): return subset_csv - logging.info("Downloading and process the voxceleb in %s", directory) - logging.info("Preparing subset %s", subset) + logger.info("Downloading and process the voxceleb in %s", directory) + logger.info("Preparing subset %s", subset) download_and_extract(directory, subset, urls[subset]) convert_audio_and_make_label(directory, subset, directory, subset + ".csv") - logging.info("Finished downloading and processing") + logger.info("Finished downloading and processing") return subset_csv if __name__ == "__main__": - logging.set_verbosity(logging.INFO) + setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) if len(sys.argv) != 4: print("Usage: python prepare_data.py save_directory user password") sys.exit() diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index ff8f271d80..cc3a78b084 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -3,13 +3,13 @@ from coqpit import Coqpit from trainer import TrainerArgs, get_last_checkpoint +from trainer.generic_utils import get_experiment_folder_path, get_git_branch from trainer.io import copy_model_files from trainer.logging import logger_factory from 
trainer.logging.console_logger import ConsoleLogger from TTS.config import load_config, register_config from TTS.tts.utils.text.characters import parse_symbols -from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch @dataclass @@ -29,7 +29,7 @@ def process_args(args, config=None): args (argparse.Namespace or dict like): Parsed input arguments. config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. Returns: - c (TTS.utils.io.AttrDict): Config paramaters. + c (Coqpit): Config paramaters. out_path (str): Path to save models and logging. audio_path (str): Path to save generated test audios. c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does diff --git a/TTS/encoder/utils/visual.py b/TTS/encoder/utils/visual.py index 6575b86ec2..bfe40605df 100644 --- a/TTS/encoder/utils/visual.py +++ b/TTS/encoder/utils/visual.py @@ -1,7 +1,6 @@ import matplotlib import matplotlib.pyplot as plt import numpy as np -import umap matplotlib.use("Agg") @@ -30,6 +29,10 @@ def plot_embeddings(embeddings, num_classes_in_batch): + try: + import umap + except ImportError as e: + raise ImportError("Package not installed: umap-learn") from e num_utter_per_class = embeddings.shape[0] // num_classes_in_batch # if necessary get just the first 10 classes diff --git a/TTS/model.py b/TTS/model.py index ae6be7b444..c3707c85ae 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,5 +1,6 @@ +import os from abc import abstractmethod -from typing import Dict +from typing import Any, Union import torch from coqpit import Coqpit @@ -16,7 +17,7 @@ class BaseTrainerModel(TrainerModel): @staticmethod @abstractmethod - def init_from_config(config: Coqpit): + def init_from_config(config: Coqpit) -> "BaseTrainerModel": """Init the model and all its attributes from the given config. Override this depending on your model. @@ -24,7 +25,7 @@ def init_from_config(config: Coqpit): ... 
@abstractmethod - def inference(self, input: torch.Tensor, aux_input={}) -> Dict: + def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]: """Forward pass for inference. It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` @@ -45,15 +46,21 @@ def inference(self, input: torch.Tensor, aux_input={}) -> Dict: @abstractmethod def load_checkpoint( - self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False + self, + config: Coqpit, + checkpoint_path: Union[str, os.PathLike[Any]], + eval: bool = False, + strict: bool = True, + cache: bool = False, ) -> None: - """Load a model checkpoint gile and get ready for training or inference. + """Load a model checkpoint file and get ready for training or inference. Args: config (Coqpit): Model configuration. - checkpoint_path (str): Path to the model checkpoint file. + checkpoint_path (str | os.PathLike): Path to the model checkpoint file. eval (bool, optional): If true, init model for inference else for training. Defaults to False. strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. - cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False. + cache (bool, optional): If True, cache the file locally for subsequent calls. + It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... diff --git a/TTS/server/README.md b/TTS/server/README.md index 270656c4e3..ae8e38a4e3 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -1,5 +1,8 @@ # :frog: TTS demo server -Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below. 
+Before you use the server, make sure you +[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts)) :frog: TTS +properly and install the additional dependencies with `pip install +coqui-tts[server]`. Then, you can follow the steps below. **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. @@ -12,7 +15,7 @@ Run the server with the official models. ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` Run the server with the official models on a GPU. -```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` +```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda``` Run the server with a custom models. 
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` diff --git a/TTS/server/server.py b/TTS/server/server.py index 6b2141a9aa..f410fb7539 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -1,7 +1,11 @@ #!flask/bin/python + +"""TTS demo server.""" + import argparse import io import json +import logging import os import sys from pathlib import Path @@ -9,24 +13,26 @@ from typing import Union from urllib.parse import parse_qs -from flask import Flask, render_template, render_template_string, request, send_file +try: + from flask import Flask, render_template, render_template_string, request, send_file +except ImportError as e: + msg = "Server requires requires flask, use `pip install coqui-tts[server]`" + raise ImportError(msg) from e from TTS.config import load_config +from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer +logger = logging.getLogger(__name__) +setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) -def create_argparser(): - def convert_boolean(x): - return x.lower() in ["true", "1", "yes"] +def create_argparser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( "--list_models", - type=convert_boolean, - nargs="?", - const=True, - default=False, + action="store_true", help="list available pre-trained tts and vocoder models.", ) parser.add_argument( @@ -54,9 +60,13 @@ def convert_boolean(x): parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") - parser.add_argument("--use_cuda", 
type=convert_boolean, default=False, help="true to use CUDA.") - parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.") - parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.") + parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") + parser.add_argument( + "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." + ) + parser.add_argument( + "--show_details", action=argparse.BooleanOptionalAction, default=False, help="Generate model detail page." + ) return parser @@ -66,10 +76,6 @@ def convert_boolean(x): path = Path(__file__).parent / "../.models.json" manager = ModelManager(path) -if args.list_models: - manager.list_models() - sys.exit() - # update in-use models to the specified released models. model_path = None config_path = None @@ -164,17 +170,15 @@ def index(): def details(): if args.config_path is not None and os.path.isfile(args.config_path): model_config = load_config(args.config_path) - else: - if args.model_name is not None: - model_config = load_config(config_path) + elif args.model_name is not None: + model_config = load_config(config_path) if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): vocoder_config = load_config(args.vocoder_config_path) + elif args.vocoder_name is not None: + vocoder_config = load_config(vocoder_config_path) else: - if args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + vocoder_config = None return render_template( "details.html", @@ -197,9 +201,9 @@ def tts(): style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) - print(f" > Model input: {text}") - print(f" > Speaker Idx: {speaker_idx}") - print(f" > Language Idx: {language_idx}") + 
logger.info("Model input: %s", text) + logger.info("Speaker idx: %s", speaker_idx) + logger.info("Language idx: %s", language_idx) wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) out = io.BytesIO() synthesizer.save_wav(wavs, out) @@ -243,7 +247,7 @@ def mary_tts_api_process(): text = data.get("INPUT_TEXT", [""])[0] else: text = request.args.get("INPUT_TEXT", "") - print(f" > Model input: {text}") + logger.info("Model input: %s", text) wavs = synthesizer.tts(text) out = io.BytesIO() synthesizer.save_wav(wavs, out) diff --git a/TTS/server/templates/details.html b/TTS/server/templates/details.html index 51c9ed85a8..85ff959591 100644 --- a/TTS/server/templates/details.html +++ b/TTS/server/templates/details.html @@ -128,4 +128,4 @@ - \ No newline at end of file + diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index 6354d3919d..6bfd5ae2cb 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -30,7 +30,7 @@ - Fork me on GitHub @@ -151,4 +151,4 @@ - \ No newline at end of file + diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index 4d1cd1374a..3b893558aa 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -2,11 +2,12 @@ from dataclasses import dataclass, field from typing import Dict +from trainer.io import get_user_data_dir + from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.layers.bark.model import GPTConfig from TTS.tts.layers.bark.model_fine import FineGPTConfig from TTS.tts.models.bark import BarkAudioConfig -from TTS.utils.generic_utils import get_user_data_dir @dataclass diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 192138561f..f9f2cb2e37 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -1,3 +1,4 @@ +import logging import os import sys from collections import Counter @@ -9,6 +10,8 @@ from 
TTS.tts.datasets.dataset import * from TTS.tts.datasets.formatters import * +logger = logging.getLogger(__name__) + def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. @@ -122,7 +125,7 @@ def load_tts_samples( meta_data_train = add_extra_keys(meta_data_train, language, dataset_name) - print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") + logger.info("Found %d files in %s", len(meta_data_train), Path(root_path).resolve()) # load evaluation split if set if eval_split: if meta_file_val: @@ -166,16 +169,15 @@ def _get_formatter_by_name(name): return getattr(thismodule, name.lower()) -def find_unique_chars(data_samples, verbose=True): - texts = "".join(item[0] for item in data_samples) +def find_unique_chars(data_samples): + texts = "".join(item["text"] for item in data_samples) chars = set(texts) lower_chars = filter(lambda c: c.islower(), chars) chars_force_lower = [c.lower() for c in chars] chars_force_lower = set(chars_force_lower) - if verbose: - print(f" > Number of unique characters: {len(chars)}") - print(f" > Unique characters: {''.join(sorted(chars))}") - print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") - print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + logger.info("Number of unique characters: %d", len(chars)) + logger.info("Unique characters: %s", "".join(sorted(chars))) + logger.info("Unique lower characters: %s", "".join(sorted(lower_chars))) + logger.info("Unique all forced to lower characters: %s", "".join(sorted(chars_force_lower))) return chars_force_lower diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index c673c963b6..37e3a1779d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -1,11 +1,14 @@ import base64 import collections +import logging import os import random -from typing import Dict, 
List, Union +from typing import Any, Optional, Union import numpy as np +import numpy.typing as npt import torch +import torchaudio import tqdm from torch.utils.data import Dataset @@ -13,6 +16,8 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy +logger = logging.getLogger(__name__) + # to prevent too many open files error as suggested here # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936 torch.multiprocessing.set_sharing_strategy("file_system") @@ -28,18 +33,34 @@ def _parse_sample(item): elif len(item) == 3: text, wav_file, speaker_name = item else: - raise ValueError(" [!] Dataset cannot parse the sample.") + msg = "Dataset cannot parse the sample." + raise ValueError(msg) return text, wav_file, speaker_name, language_name, attn_file -def noise_augment_audio(wav): +def noise_augment_audio(wav: npt.NDArray) -> npt.NDArray: return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape) -def string2filename(string): +def string2filename(string: str) -> str: # generate a safe and reversible filename based on a string - filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") - return filename + return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") + + +def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: + """Return the number of samples in the audio file.""" + if not isinstance(audiopath, str): + audiopath = str(audiopath) + extension = audiopath.rpartition(".")[-1].lower() + if extension not in {"mp3", "wav", "flac"}: + msg = f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!" 
+ raise RuntimeError(msg) + + try: + return torchaudio.info(audiopath).num_frames + except RuntimeError as e: + msg = f"Failed to decode {audiopath}" + raise RuntimeError(msg) from e class TTSDataset(Dataset): @@ -48,32 +69,32 @@ def __init__( outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: List[Dict] = None, + samples: Optional[list[dict]] = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: str = None, - energy_cache_path: str = None, + f0_cache_path: Optional[str] = None, + energy_cache_path: Optional[str] = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: str = None, + phoneme_cache_path: Optional[str] = None, precompute_num_workers: int = 0, - speaker_id_mapping: Dict = None, - d_vector_mapping: Dict = None, - language_id_mapping: Dict = None, + speaker_id_mapping: Optional[dict] = None, + d_vector_mapping: Optional[dict] = None, + language_id_mapping: Optional[dict] = None, use_noise_augment: bool = False, start_by_longest: bool = False, - verbose: bool = False, - ): + ) -> None: """Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs. If you need something different, you can subclass and override. Args: + ---- outputs_per_step (int): Number of time frames predicted per step. compute_linear_spec (bool): compute linear spectrogram if True. @@ -126,7 +147,6 @@ def __init__( start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False. - verbose (bool): Print diagnostic information. Defaults to false. 
""" super().__init__() self.batch_group_size = batch_group_size @@ -150,33 +170,44 @@ def __init__( self.use_noise_augment = use_noise_augment self.start_by_longest = start_by_longest - self.verbose = verbose self.rescue_item_idx = 1 self.pitch_computed = False self.tokenizer = tokenizer if self.tokenizer.use_phonemes: self.phoneme_dataset = PhonemeDataset( - self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers + self.samples, + self.tokenizer, + phoneme_cache_path, + precompute_num_workers=precompute_num_workers, ) if compute_f0: self.f0_dataset = F0Dataset( - self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers + self.samples, + self.ap, + cache_path=f0_cache_path, + precompute_num_workers=precompute_num_workers, ) if compute_energy: self.energy_dataset = EnergyDataset( - self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers + self.samples, + self.ap, + cache_path=energy_cache_path, + precompute_num_workers=precompute_num_workers, ) - if self.verbose: - self.print_logs() + self.print_logs() @property - def lengths(self): + def lengths(self) -> list[int]: lens = [] for item in self.samples: _, wav_file, *_ = _parse_sample(item) - audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + try: + audio_len = get_audio_size(wav_file) + except RuntimeError: + logger.warning(f"Failed to compute length for {item['audio_file']}") + audio_len = 0 lens.append(audio_len) return lens @@ -185,7 +216,7 @@ def samples(self): return self._samples @samples.setter - def samples(self, new_samples): + def samples(self, new_samples) -> None: self._samples = new_samples if hasattr(self, "f0_dataset"): self.f0_dataset.samples = new_samples @@ -194,7 +225,7 @@ def samples(self, new_samples): if hasattr(self, "phoneme_dataset"): self.phoneme_dataset.samples = new_samples - def __len__(self): + def __len__(self) -> int: return len(self.samples) def 
__getitem__(self, idx): @@ -202,11 +233,10 @@ def __getitem__(self, idx): def print_logs(self, level: int = 0) -> None: indent = "\t" * level - print("\n") - print(f"{indent}> DataLoader initialization") - print(f"{indent}| > Tokenizer:") + logger.info("%sDataLoader initialization", indent) + logger.info("%s| Tokenizer:", indent) self.tokenizer.print_logs(level + 1) - print(f"{indent}| > Number of instances : {len(self.samples)}") + logger.info("%s| Number of instances : %d", indent, len(self.samples)) def load_wav(self, filename): waveform = self.ap.load_wav(filename) @@ -242,7 +272,7 @@ def get_token_ids(self, idx, text): token_ids = self.tokenizer.text_to_ids(text) return np.array(token_ids, dtype=np.int32) - def load_data(self, idx): + def load_data(self, idx) -> dict[str, Any]: item = self.samples[idx] raw_text = item["text"] @@ -276,7 +306,7 @@ def load_data(self, idx): if self.compute_energy: energy = self.get_energy(idx)["energy"] - sample = { + return { "raw_text": raw_text, "token_ids": token_ids, "wav": wav, @@ -289,13 +319,16 @@ def load_data(self, idx): "wav_file_name": os.path.basename(item["audio_file"]), "audio_unique_name": item["audio_unique_name"], } - return sample @staticmethod def _compute_lengths(samples): new_samples = [] for item in samples: - audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio + try: + audio_length = get_audio_size(item["audio_file"]) + except RuntimeError: + logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + continue text_lenght = len(item["text"]) item["audio_length"] = audio_length item["text_length"] = text_lenght @@ -303,7 +336,7 @@ def _compute_lengths(samples): return new_samples @staticmethod - def filter_by_length(lengths: List[int], min_len: int, max_len: int): + def filter_by_length(lengths: list[int], min_len: int, max_len: int): idxs = np.argsort(lengths) # ascending order ignore_idx = [] keep_idx = [] @@ -316,10 +349,9 @@ def filter_by_length(lengths: 
List[int], min_len: int, max_len: int): return ignore_idx, keep_idx @staticmethod - def sort_by_length(samples: List[List]): + def sort_by_length(samples: list[list]): audio_lengths = [s["audio_length"] for s in samples] - idxs = np.argsort(audio_lengths) # ascending order - return idxs + return np.argsort(audio_lengths) # ascending order @staticmethod def create_buckets(samples, batch_group_size: int): @@ -339,7 +371,7 @@ def _select_samples_by_idx(idxs, samples): samples_new.append(samples[idx]) return samples_new - def preprocess_samples(self): + def preprocess_samples(self) -> None: r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length range. """ @@ -365,7 +397,8 @@ def preprocess_samples(self): samples = self._select_samples_by_idx(sorted_idxs, samples) if len(samples) == 0: - raise RuntimeError(" [!] No samples left") + msg = "No samples left." + raise RuntimeError(msg) # shuffle batch groups # create batches with similar length items @@ -378,39 +411,38 @@ def preprocess_samples(self): text_lengths = [s["text_length"] for s in samples] self.samples = samples - if self.verbose: - print(" | > Preprocessing samples") - print(" | > Max text length: {}".format(np.max(text_lengths))) - print(" | > Min text length: {}".format(np.min(text_lengths))) - print(" | > Avg text length: {}".format(np.mean(text_lengths))) - print(" | ") - print(" | > Max audio length: {}".format(np.max(audio_lengths))) - print(" | > Min audio length: {}".format(np.min(audio_lengths))) - print(" | > Avg audio length: {}".format(np.mean(audio_lengths))) - print(f" | > Num. 
instances discarded samples: {len(ignore_idx)}") - print(" | > Batch group size: {}.".format(self.batch_group_size)) + logger.info("Preprocessing samples") + logger.info(f"Max text length: {np.max(text_lengths)}") + logger.info(f"Min text length: {np.min(text_lengths)}") + logger.info(f"Avg text length: {np.mean(text_lengths)}") + logger.info(f"Max audio length: {np.max(audio_lengths)}") + logger.info(f"Min audio length: {np.min(audio_lengths)}") + logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Num. instances discarded samples: %d", len(ignore_idx)) + logger.info(f"Batch group size: {self.batch_group_size}.") @staticmethod def _sort_batch(batch, text_lengths): """Sort the batch by the input text length for RNN efficiency. Args: + ---- batch (Dict): Batch returned by `__getitem__`. text_lengths (List[int]): Lengths of the input character sequences. + """ text_lengths, ids_sorted_decreasing = torch.sort(torch.LongTensor(text_lengths), dim=0, descending=True) batch = [batch[idx] for idx in ids_sorted_decreasing] return batch, text_lengths, ids_sorted_decreasing def collate_fn(self, batch): - r""" - Perform preprocessing and create a final data batch: + """Perform preprocessing and create a final data batch. + 1. Sort batch instances by text-length 2. Convert Audio signal to features. 3. PAD sequences wrt r. 4. Load to Torch. 
""" - # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.abc.Mapping): token_ids_lengths = np.array([len(d["token_ids"]) for d in batch]) @@ -445,9 +477,11 @@ def collate_fn(self, batch): # lengths adjusted by the reduction factor mel_lengths_adjusted = [ - m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step)) - if m.shape[1] % self.outputs_per_step - else m.shape[1] + ( + m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step)) + if m.shape[1] % self.outputs_per_step + else m.shape[1] + ) for m in mel ] @@ -553,23 +587,18 @@ def collate_fn(self, batch): "audio_unique_names": batch["audio_unique_name"], } - raise TypeError( - ( - "batch must contain tensors, numbers, dicts or lists;\ - found {}".format( - type(batch[0]) - ) - ) - ) + msg = f"batch must contain tensors, numbers, dicts or lists; found {type(batch[0])}" + raise TypeError(msg) class PhonemeDataset(Dataset): - """Phoneme Dataset for converting input text to phonemes and then token IDs + """Phoneme Dataset for converting input text to phonemes and then token IDs. At initialization, it pre-computes the phonemes under `cache_path` and loads them in training to reduce data loading latency. If `cache_path` is already present, it skips the pre-computation. Args: + ---- samples (Union[List[List], List[Dict]]): List of samples. Each sample is a list or a dict. @@ -581,15 +610,16 @@ class PhonemeDataset(Dataset): precompute_num_workers (int): Number of workers used for pre-computing the phonemes. Defaults to 0. 
+ """ def __init__( self, - samples: Union[List[Dict], List[List]], + samples: Union[list[dict], list[list]], tokenizer: "TTSTokenizer", cache_path: str, - precompute_num_workers=0, - ): + precompute_num_workers: int = 0, + ) -> None: self.samples = samples self.tokenizer = tokenizer self.cache_path = cache_path @@ -597,16 +627,16 @@ def __init__( os.makedirs(cache_path) self.precompute(precompute_num_workers) - def __getitem__(self, index): + def __getitem__(self, index) -> dict[str, Any]: item = self.samples[index] ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"]) ph_hat = self.tokenizer.ids_to_text(ids) return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)} - def __len__(self): + def __len__(self) -> int: return len(self.samples) - def compute_or_load(self, file_name, text, language): + def compute_or_load(self, file_name: str, text: str, language: str) -> list[int]: """Compute phonemes for the given text. If the phonemes are already cached, load them from cache. @@ -620,20 +650,24 @@ def compute_or_load(self, file_name, text, language): np.save(cache_path, ids) return ids - def get_pad_id(self): - """Get pad token ID for sequence padding""" + def get_pad_id(self) -> int: + """Get pad token ID for sequence padding.""" return self.tokenizer.pad_id - def precompute(self, num_workers=1): + def precompute(self, num_workers: int = 1) -> None: """Precompute phonemes for all samples. We use pytorch dataloader because we are lazy. 
""" - print("[*] Pre-computing phonemes...") + logger.info("Pre-computing phonemes...") with tqdm.tqdm(total=len(self)) as pbar: batch_size = num_workers if num_workers > 0 else 1 dataloder = torch.utils.data.DataLoader( - batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn + batch_size=batch_size, + dataset=self, + shuffle=False, + num_workers=num_workers, + collate_fn=self.collate_fn, ) for _ in dataloder: pbar.update(batch_size) @@ -651,20 +685,20 @@ def collate_fn(self, batch): def print_logs(self, level: int = 0) -> None: indent = "\t" * level - print("\n") - print(f"{indent}> PhonemeDataset ") - print(f"{indent}| > Tokenizer:") + logger.info("%sPhonemeDataset", indent) + logger.info("%s| Tokenizer:", indent) self.tokenizer.print_logs(level + 1) - print(f"{indent}| > Number of instances : {len(self.samples)}") + logger.info("%s| Number of instances : %d", indent, len(self.samples)) class F0Dataset: - """F0 Dataset for computing F0 from wav files in CPU + """F0 Dataset for computing F0 from wav files in CPU. Pre-compute F0 values for all the samples at initialization if `cache_path` is not None or already present. It also computes the mean and std of F0 values if `normalize_f0` is True. Args: + ---- samples (Union[List[List], List[Dict]]): List of samples. Each sample is a list or a dict. @@ -680,21 +714,20 @@ class F0Dataset: normalize_f0 (bool): Whether to normalize F0 values by mean and std. Defaults to True. 
+ """ def __init__( self, - samples: Union[List[List], List[Dict]], + samples: Union[list[list], list[dict]], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - verbose=False, - cache_path: str = None, - precompute_num_workers=0, - normalize_f0=True, - ): + cache_path: Optional[str] = None, + precompute_num_workers: int = 0, + normalize_f0: bool = True, + ) -> None: self.samples = samples self.ap = ap - self.verbose = verbose self.cache_path = cache_path self.normalize_f0 = normalize_f0 self.pad_id = 0.0 @@ -714,18 +747,22 @@ def __getitem__(self, idx): f0 = self.normalize(f0) return {"audio_unique_name": item["audio_unique_name"], "f0": f0} - def __len__(self): + def __len__(self) -> int: return len(self.samples) - def precompute(self, num_workers=0): - print("[*] Pre-computing F0s...") + def precompute(self, num_workers: int = 0) -> None: + logger.info("Pre-computing F0s...") with tqdm.tqdm(total=len(self)) as pbar: batch_size = num_workers if num_workers > 0 else 1 # we do not normalize at preproessing normalize_f0 = self.normalize_f0 self.normalize_f0 = False dataloder = torch.utils.data.DataLoader( - batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn + batch_size=batch_size, + dataset=self, + shuffle=False, + num_workers=num_workers, + collate_fn=self.collate_fn, ) computed_data = [] for batch in dataloder: @@ -744,9 +781,8 @@ def get_pad_id(self): return self.pad_id @staticmethod - def create_pitch_file_path(file_name, cache_path): - pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") - return pitch_file + def create_pitch_file_path(file_name: str, cache_path: str) -> str: + return os.path.join(cache_path, file_name + "_pitch.npy") @staticmethod def _compute_and_save_pitch(ap, wav_file, pitch_file=None): @@ -762,7 +798,7 @@ def compute_pitch_stats(pitch_vecs): mean, std = np.mean(nonzeros), np.std(nonzeros) return mean, std - def load_stats(self, cache_path): + def 
load_stats(self, cache_path) -> None: stats_path = os.path.join(cache_path, "pitch_stats.npy") stats = np.load(stats_path, allow_pickle=True).item() self.mean = stats["mean"].astype(np.float32) @@ -783,9 +819,7 @@ def denormalize(self, pitch): return pitch def compute_or_load(self, wav_file, audio_unique_name): - """ - compute pitch and return a numpy array of pitch values - """ + """Compute pitch and return a numpy array of pitch values.""" pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path) if not os.path.exists(pitch_file): pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file) @@ -805,18 +839,18 @@ def collate_fn(self, batch): def print_logs(self, level: int = 0) -> None: indent = "\t" * level - print("\n") - print(f"{indent}> F0Dataset ") - print(f"{indent}| > Number of instances : {len(self.samples)}") + logger.info("%sF0Dataset", indent) + logger.info("%s| Number of instances : %d", indent, len(self.samples)) class EnergyDataset: - """Energy Dataset for computing Energy from wav files in CPU + """Energy Dataset for computing Energy from wav files in CPU. Pre-compute Energy values for all the samples at initialization if `cache_path` is not None or already present. It also computes the mean and std of Energy values if `normalize_Energy` is True. Args: + ---- samples (Union[List[List], List[Dict]]): List of samples. Each sample is a list or a dict. @@ -832,20 +866,19 @@ class EnergyDataset: normalize_Energy (bool): Whether to normalize Energy values by mean and std. Defaults to True. 
+ """ def __init__( self, - samples: Union[List[List], List[Dict]], + samples: Union[list[list], list[dict]], ap: "AudioProcessor", - verbose=False, - cache_path: str = None, + cache_path: Optional[str] = None, precompute_num_workers=0, normalize_energy=True, - ): + ) -> None: self.samples = samples self.ap = ap - self.verbose = verbose self.cache_path = cache_path self.normalize_energy = normalize_energy self.pad_id = 0.0 @@ -865,18 +898,22 @@ def __getitem__(self, idx): energy = self.normalize(energy) return {"audio_unique_name": item["audio_unique_name"], "energy": energy} - def __len__(self): + def __len__(self) -> int: return len(self.samples) - def precompute(self, num_workers=0): - print("[*] Pre-computing energys...") + def precompute(self, num_workers=0) -> None: + logger.info("Pre-computing energys...") with tqdm.tqdm(total=len(self)) as pbar: batch_size = num_workers if num_workers > 0 else 1 # we do not normalize at preproessing normalize_energy = self.normalize_energy self.normalize_energy = False dataloder = torch.utils.data.DataLoader( - batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn + batch_size=batch_size, + dataset=self, + shuffle=False, + num_workers=num_workers, + collate_fn=self.collate_fn, ) computed_data = [] for batch in dataloder: @@ -897,8 +934,7 @@ def get_pad_id(self): @staticmethod def create_energy_file_path(wav_file, cache_path): file_name = os.path.splitext(os.path.basename(wav_file))[0] - energy_file = os.path.join(cache_path, file_name + "_energy.npy") - return energy_file + return os.path.join(cache_path, file_name + "_energy.npy") @staticmethod def _compute_and_save_energy(ap, wav_file, energy_file=None): @@ -914,7 +950,7 @@ def compute_energy_stats(energy_vecs): mean, std = np.mean(nonzeros), np.std(nonzeros) return mean, std - def load_stats(self, cache_path): + def load_stats(self, cache_path) -> None: stats_path = os.path.join(cache_path, "energy_stats.npy") stats = 
np.load(stats_path, allow_pickle=True).item() self.mean = stats["mean"].astype(np.float32) @@ -935,9 +971,7 @@ def denormalize(self, energy): return energy def compute_or_load(self, wav_file, audio_unique_name): - """ - compute energy and return a numpy array of energy values - """ + """Compute energy and return a numpy array of energy values.""" energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path) if not os.path.exists(energy_file): energy = self._compute_and_save_energy(self.ap, wav_file, energy_file) @@ -957,6 +991,5 @@ def collate_fn(self, batch): def print_logs(self, level: int = 0) -> None: indent = "\t" * level - print("\n") - print(f"{indent}> energyDataset ") - print(f"{indent}| > Number of instances : {len(self.samples)}") + logger.info("%senergyDataset") + logger.info("%s| Number of instances : %d", indent, len(self.samples)) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 053444b0c1..ff1a76e2c9 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -1,3 +1,5 @@ +import csv +import logging import os import re import xml.etree.ElementTree as ET @@ -5,9 +7,10 @@ from pathlib import Path from typing import List -import pandas as pd from tqdm import tqdm +logger = logging.getLogger(__name__) + ######################## # DATASETS ######################## @@ -23,32 +26,34 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): if len(line.split("|")) != num_cols: - print(f" > Missing column in line {idx + 1} -> {line.strip()}") + logger.warning("Missing column in line %d -> %s", idx + 1, line.strip()) # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["wav_filename", "transcript"]) - client_id = None if "client_id" in metadata.columns else "default" - emotion_name = None if 
"emotion_name" in metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["wav_filename", "transcript"]) + client_id = None if "client_id" in metadata[0] else "default" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers: + for row in metadata: + if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.wav_filename) + audio_path = os.path.join(root_path, row["wav_filename"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.transcript, + "text": row["transcript"], "audio_file": audio_path, - "speaker_name": client_id if client_id is not None else row.client_id, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": client_id if client_id is not None else row["client_id"], + "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"], "root_path": root_path, } ) if not_found_counter > 0: - print(f" | > [!] 
{not_found_counter} files not found") + logger.warning("%d files not found", not_found_counter) return items @@ -61,32 +66,34 @@ def coqui(root_path, meta_file, ignored_speakers=None): num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): if len(line.split("|")) != num_cols: - print(f" > Missing column in line {idx + 1} -> {line.strip()}") + logger.warning("Missing column in line %d -> %s", idx + 1, line.strip()) # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["audio_file", "text"]) - speaker_name = None if "speaker_name" in metadata.columns else "coqui" - emotion_name = None if "emotion_name" in metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["audio_file", "text"]) + speaker_name = None if "speaker_name" in metadata[0] else "coqui" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers: + for row in metadata: + if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.audio_file) + audio_path = os.path.join(root_path, row["audio_file"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.text, + "text": row["text"], "audio_file": audio_path, - "speaker_name": speaker_name if speaker_name is not None else row.speaker_name, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": speaker_name if speaker_name is not None else row["speaker_name"], + "emotion_name": emotion_name if emotion_name is not 
None else row["emotion_name"], "root_path": root_path, } ) if not_found_counter > 0: - print(f" | > [!] {not_found_counter} files not found") + logger.warning("%d files not found", not_found_counter) return items @@ -169,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_name in ignored_speakers: continue - print(" | > {}".format(csv_file)) + logger.info(csv_file) with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") @@ -184,7 +191,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): ) else: # M-AI-Labs have some missing samples, so just print the warning - print("> File %s does not exist!" % (wav_file)) + logger.warning("File %s does not exist!", wav_file) return items @@ -249,7 +256,7 @@ def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-arg text = item.text wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav") if not os.path.exists(wav_file): - print(f" [!] {wav_file} in metafile does not exist. Skipping...") + logger.warning("%s in metafile does not exist. Skipping...", wav_file) continue items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -370,7 +377,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar continue text = cols[1].strip() items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) - print(f" [!] {len(skipped_files)} files skipped. They don't exist...") + logger.warning("%d files skipped. They don't exist...", len(skipped_files)) return items @@ -438,7 +445,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path} ) else: - print(f" [!] 
wav files don't exist - {wav_file}") + logger.warning("Wav file doesn't exist - %s", wav_file) return items diff --git a/TTS/tts/layers/bark/hubert/hubert_manager.py b/TTS/tts/layers/bark/hubert/hubert_manager.py index 4bc1992941..fd936a9157 100644 --- a/TTS/tts/layers/bark/hubert/hubert_manager.py +++ b/TTS/tts/layers/bark/hubert/hubert_manager.py @@ -1,11 +1,14 @@ # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer +import logging import os.path import shutil import urllib.request import huggingface_hub +logger = logging.getLogger(__name__) + class HubertManager: @staticmethod @@ -13,9 +16,9 @@ def make_sure_hubert_installed( download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = "" ): if not os.path.isfile(model_path): - print("Downloading HuBERT base model") + logger.info("Downloading HuBERT base model") urllib.request.urlretrieve(download_url, model_path) - print("Downloaded HuBERT") + logger.info("Downloaded HuBERT") return model_path return None @@ -27,9 +30,9 @@ def make_sure_tokenizer_installed( ): model_dir = os.path.dirname(model_path) if not os.path.isfile(model_path): - print("Downloading HuBERT custom tokenizer") + logger.info("Downloading HuBERT custom tokenizer") huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False) shutil.move(os.path.join(model_dir, model), model_path) - print("Downloaded tokenizer") + logger.info("Downloaded tokenizer") return model_path return None diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index a6a3b9aeb1..9e487b1e9d 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,8 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py -import logging -from pathlib import Path import torch from einops import pack, unpack diff --git 
a/TTS/tts/layers/bark/hubert/tokenizer.py b/TTS/tts/layers/bark/hubert/tokenizer.py index 3070241f1c..cd9579799a 100644 --- a/TTS/tts/layers/bark/hubert/tokenizer.py +++ b/TTS/tts/layers/bark/hubert/tokenizer.py @@ -5,6 +5,7 @@ """ import json +import logging import os.path from zipfile import ZipFile @@ -12,6 +13,8 @@ import torch from torch import nn, optim +logger = logging.getLogger(__name__) + class HubertTokenizer(nn.Module): def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0): @@ -85,7 +88,7 @@ def train_step(self, x_train, y_train, log_loss=False): # Print loss if log_loss: - print("Loss", loss.item()) + logger.info("Loss %.3f", loss.item()) # Backward pass loss.backward() @@ -157,10 +160,10 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep data_x, data_y = [], [] if load_model and os.path.isfile(load_model): - print("Loading model from", load_model) + logger.info("Loading model from %s", load_model) model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda") else: - print("Creating new model.") + logger.info("Creating new model.") model_training = HubertTokenizer(version=1).to("cuda") # Settings for the model to run without lstm save_path = os.path.join(data_path, save_path) base_save_path = ".".join(save_path.split(".")[:-1]) @@ -191,5 +194,5 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep save_p_2 = f"{base_save_path}_epoch_{epoch}.pth" model_training.save(save_p) model_training.save(save_p_2) - print(f"Epoch {epoch} completed") + logger.info("Epoch %d completed", epoch) epoch += 1 diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index f3d3fee937..b2875c7a83 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,10 +2,11 @@ import os import re from glob import glob -from typing import Dict, List +from typing import Dict, List, Optional, Tuple import 
librosa import numpy as np +import numpy.typing as npt import torch import torchaudio import tqdm @@ -48,7 +49,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file): +def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -56,7 +57,11 @@ def load_npz(npz_file): return semantic, coarse, fine -def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def load_voice( + model, voice: str, extra_voice_dirs: List[str] = [] +) -> Tuple[ + Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] +]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None @@ -107,11 +112,10 @@ def generate_voice( model, output_path, ): - """Generate a new voice from a given audio and text prompt. + """Generate a new voice from a given audio. Args: audio (np.ndarray): The audio to use as a base for the new voice. - text (str): Transcription of the audio you are clonning. model (BarkModel): The BarkModel to use for generating the new voice. output_path (str): The path to save the generated voice to. 
""" diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index c84022bd08..68c50dbdbd 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -2,6 +2,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT (https://github.com/karpathy/nanoGPT) """ + import math from dataclasses import dataclass diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 09e5f4765d..29126b41ab 100644 --- a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -2,6 +2,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT (https://github.com/karpathy/nanoGPT) """ + import math from dataclasses import dataclass diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index c906b882e5..83989f9ba4 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,4 +1,5 @@ ### credit: https://github.com/dunky11/voicesmith +import logging from typing import Callable, Dict, Tuple import torch @@ -20,6 +21,8 @@ from TTS.tts.layers.generic.aligner import AlignmentNetwork from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +logger = logging.getLogger(__name__) + class AcousticModel(torch.nn.Module): def __init__( @@ -217,7 +220,7 @@ def _set_speaker_input(self, aux_input: Dict): def _init_speaker_embedding(self): # pylint: disable=attribute-defined-outside-init if self.num_speakers > 0: - print(" > initialization of speaker-embedding layers.") + logger.info("Initialization of speaker-embedding layers.") self.embedded_speaker_dim = self.args.speaker_embedding_channels self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) @@ -362,7 +365,7 @@ def forward( pos_encoding = positional_encoding( self.emb_dim, - max(token_embeddings.shape[1], max(mel_lens)), + max(token_embeddings.shape[1], *mel_lens), device=token_embeddings.device, ) 
encoder_outputs = self.encoder( diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index b02c311808..77a796473b 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -1,5 +1,4 @@ import torch -from packaging.version import Version from torch import nn from torch.nn import functional as F @@ -90,10 +89,7 @@ def __init__(self, channels, num_splits=4, no_jacobian=False, **kwargs): # pyli self.no_jacobian = no_jacobian self.weight_inv = None - if Version(torch.__version__) < Version("1.9"): - w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] - else: - w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] + w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] diff --git a/TTS/tts/layers/glow_tts/transformer.py b/TTS/tts/layers/glow_tts/transformer.py index 02688d611f..c97d070a95 100644 --- a/TTS/tts/layers/glow_tts/transformer.py +++ b/TTS/tts/layers/glow_tts/transformer.py @@ -5,6 +5,7 @@ from torch.nn import functional as F from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2 +from TTS.tts.utils.helpers import convert_pad_shape class RelativePositionMultiHeadAttention(nn.Module): @@ -300,7 +301,7 @@ def _causal_padding(self, x): pad_l = self.kernel_size - 1 pad_r = 0 padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, self._pad_shape(padding)) + x = F.pad(x, convert_pad_shape(padding)) return x def _same_padding(self, x): @@ -309,15 +310,9 @@ def _same_padding(self, x): pad_l = (self.kernel_size - 1) // 2 pad_r = self.kernel_size // 2 padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, self._pad_shape(padding)) + x = F.pad(x, convert_pad_shape(padding)) return x - @staticmethod - def _pad_shape(padding): - l = padding[::-1] - pad_shape = [item for sublist in l for item in sublist] - return 
pad_shape - class RelativePositionTransformer(nn.Module): """Transformer with Relative Potional Encoding. diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index de5f408c48..5ebed81dda 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -1,3 +1,4 @@ +import logging import math import numpy as np @@ -10,6 +11,8 @@ from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss from TTS.utils.audio.torch_transforms import TorchSTFT +logger = logging.getLogger(__name__) + # pylint: disable=abstract-method # relates https://github.com/pytorch/pytorch/issues/42305 @@ -132,11 +135,11 @@ def forward(self, y_hat, y, length): ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1)) if ssim_loss.item() > 1.0: - print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 1.0") + logger.info("SSIM loss is out-of-range (%.2f), setting it to 1.0", ssim_loss.item()) ssim_loss = torch.tensor(1.0, device=ssim_loss.device) if ssim_loss.item() < 0.0: - print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0") + logger.info("SSIM loss is out-of-range (%.2f), setting it to 0.0", ssim_loss.item()) ssim_loss = torch.tensor(0.0, device=ssim_loss.device) return ssim_loss @@ -252,7 +255,7 @@ def forward(self, att_ws, ilens, olens): @staticmethod def _make_ga_mask(ilen, olen, sigma): - grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen)) + grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen), indexing="ij") grid_x, grid_y = grid_x.float(), grid_y.float() return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2))) diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index b036dd1bda..9f77af293c 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,3 +1,4 @@ +import logging from typing import List, Tuple import torch @@ 
-8,6 +9,8 @@ from TTS.tts.layers.tacotron.common_layers import Linear from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock +logger = logging.getLogger(__name__) + class Encoder(nn.Module): r"""Neural HMM Encoder @@ -213,8 +216,8 @@ def _floor_std(self, std): original_tensor = std.clone().detach() std = torch.clamp(std, min=self.std_floor) if torch.any(original_tensor != std): - print( - "[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about" + logger.info( + "Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about" ) return std diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index 0631ba98c0..a12becef03 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -128,7 +128,8 @@ def forward(self, inputs, inputs_len, mels, mel_lens): # Get mean, std and transition vector from decoder for this timestep # Note: Gradient checkpointing currently doesn't works with multiple gpus inside a loop if self.use_grad_checkpointing and self.training: - mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs) + # TODO: use_reentrant=False is recommended + mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs, use_reentrant=True) else: mean, std, transition_vector = self.output_net(h_memory, inputs) diff --git a/TTS/tts/layers/overflow/plotting_utils.py b/TTS/tts/layers/overflow/plotting_utils.py index a63aeb370a..d9d3e3d141 100644 --- a/TTS/tts/layers/overflow/plotting_utils.py +++ b/TTS/tts/layers/overflow/plotting_utils.py @@ -71,7 +71,7 @@ def plot_transition_probabilities_to_numpy(states, transition_probabilities, out ax.set_title("Transition probability of state") ax.set_xlabel("hidden state") ax.set_ylabel("probability") - ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension + 
ax.set_xticks(list(range(len(transition_probabilities)))) ax.set_xticklabels([int(x) for x in states], rotation=90) plt.tight_layout() if not output_fig: diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 7a47c35ef6..32643dfcee 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,12 +1,16 @@ # coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch +import logging + import torch from torch import nn from .attentions import init_attn from .common_layers import Prenet +logger = logging.getLogger(__name__) + class BatchNormConv1d(nn.Module): r"""A wrapper for Conv1d with BatchNorm. It sets the activation @@ -480,7 +484,7 @@ def inference(self, inputs): if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6): break if t > self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") + logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps) break return self._parse_outputs(outputs, attentions, stop_tokens) diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index c79b709972..727bf9ecfd 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -1,3 +1,5 @@ +import logging + import torch from torch import nn from torch.nn import functional as F @@ -5,6 +7,8 @@ from .attentions import init_attn from .common_layers import Linear, Prenet +logger = logging.getLogger(__name__) + # pylint: disable=no-value-for-parameter # pylint: disable=unexpected-keyword-arg @@ -356,7 +360,7 @@ def inference(self, inputs): if stop_token > self.stop_threshold and t > inputs.shape[0] // 2: break if len(outputs) == self.max_decoder_steps: - print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}") + logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps) break memory = self._update_memory(decoder_output) 
@@ -389,7 +393,7 @@ def inference_truncated(self, inputs): if stop_token > 0.7: break if len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") + logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps) break self.memory_truncated = decoder_output diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index dad1814369..c79ef31b0c 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -1,6 +1,5 @@ import functools import math -import os import fsspec import torch diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 70711ed7a4..0b8701227b 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,3 +1,4 @@ +import logging import os from glob import glob from typing import Dict, List @@ -10,6 +11,8 @@ from TTS.utils.audio.torch_transforms import TorchSTFT +logger = logging.getLogger(__name__) + def load_wav_to_torch(full_path): sampling_rate, data = read(full_path) @@ -28,7 +31,7 @@ def check_audio(audio, audiopath: str): # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk. # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds. if torch.any(audio > 2) or not torch.any(audio < 0): - print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}") + logger.error("Error with %s. Max=%.2f min=%.2f", audiopath, audio.max(), audio.min()) audio.clip_(-1, 1) @@ -136,7 +139,7 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): for voice in voices: if voice == "random": if len(voices) > 1: - print("Cannot combine a random voice with a non-random voice. Just using a random voice.") + logger.warning("Cannot combine a random voice with a non-random voice. 
Just using a random voice.") return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: diff --git a/TTS/tts/layers/tortoise/clvp.py b/TTS/tts/layers/tortoise/clvp.py index 69b8c17c3f..241dfdd4f4 100644 --- a/TTS/tts/layers/tortoise/clvp.py +++ b/TTS/tts/layers/tortoise/clvp.py @@ -126,7 +126,7 @@ def forward(self, text, speech_tokens, return_loss=False): text_latents = self.to_text_latent(text_latents) speech_latents = self.to_speech_latent(speech_latents) - text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents)) + text_latents, speech_latents = (F.normalize(t, p=2, dim=-1) for t in (text_latents, speech_latents)) temp = self.temperature.exp() diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 7bea02ca08..2b29091b44 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -972,7 +972,7 @@ def autoregressive_training_losses( assert False # not currently supported for this type of diffusion. 
elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) - terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) + terms.update(dict(zip(model_output_keys, model_outputs))) model_output = terms[gd_out_key] if self.model_var_type in [ ModelVarType.LEARNED, diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index c70888df42..6a1d8ff784 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -1,7 +1,10 @@ +import logging import math import torch +logger = logging.getLogger(__name__) + class NoiseScheduleVP: def __init__( @@ -1171,7 +1174,7 @@ def norm_fn(v): lambda_0 - lambda_s, ) nfe += order - print("adaptive solver nfe", nfe) + logger.debug("adaptive solver nfe %d", nfe) return x def add_noise(self, x, t, noise=None): diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index 70d46aa3e0..6cb1bab96a 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -37,7 +37,7 @@ def route_args(router, args, depth): for key in matched_keys: val = args[key] for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])): - new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes) + new_f_args, new_g_args = (({key: val} if route else {}) for route in routes) routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args}) return routed_args @@ -152,7 +152,7 @@ def forward(self, x, mask=None): softmax = torch.softmax qkv = self.to_qkv(x).chunk(3, dim=-1) - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv) + q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in qkv) q = q * self.scale diff --git a/TTS/tts/layers/tortoise/utils.py b/TTS/tts/layers/tortoise/utils.py index 810a9e7f7a..898121f793 100644 --- 
a/TTS/tts/layers/tortoise/utils.py +++ b/TTS/tts/layers/tortoise/utils.py @@ -1,8 +1,11 @@ +import logging import os from urllib import request from tqdm import tqdm +logger = logging.getLogger(__name__) + DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models") MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR) MODELS_DIR = "/data/speech_synth/models/" @@ -28,10 +31,10 @@ def download_models(specific_models=None): model_path = os.path.join(MODELS_DIR, model_name) if os.path.exists(model_path): continue - print(f"Downloading {model_name} from {url}...") + logger.info("Downloading %s from %s...", model_name, url) with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t: request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n)) - print("Done.") + logger.info("Done.") def get_model_path(model_name, models_dir=MODELS_DIR): diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 1eb3f77269..9325b8c720 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -84,7 +84,7 @@ def init_zero_(layer): def pick_and_pop(keys, d): - values = list(map(lambda key: d.pop(key), keys)) + values = [d.pop(key) for key in keys] return dict(zip(keys, values)) @@ -107,7 +107,7 @@ def group_by_key_prefix(prefix, d): def groupby_prefix_and_trim(prefix, d): kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) - kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix) :], x[1]), tuple(kwargs_with_prefix.items()))) + kwargs_without_prefix = {x[0][len(prefix) :]: x[1] for x in tuple(kwargs_with_prefix.items())} return kwargs_without_prefix, kwargs @@ -428,7 +428,7 @@ def forward(self, x, **kwargs): feats_per_shift = x.shape[-1] // segments splitted = x.split(feats_per_shift, dim=-1) segments_to_shift, rest = splitted[:segments], splitted[segments:] - segments_to_shift = 
list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts))) + segments_to_shift = [shift(*args, mask=mask) for args in zip(segments_to_shift, shifts)] x = torch.cat((*segments_to_shift, *rest), dim=-1) return self.fn(x, **kwargs) @@ -635,7 +635,7 @@ def forward( v = self.to_v(v_input) if not collab_heads: - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in (q, k, v)) else: q = einsum("b i d, h d -> b h i d", q, self.collab_mixing) k = rearrange(k, "b n d -> b () n d") @@ -650,9 +650,9 @@ def forward( if exists(rotary_pos_emb) and not has_context: l = rotary_pos_emb.shape[-1] - (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) - ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl)) - q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr))) + (ql, qr), (kl, kr), (vl, vr) = ((t[..., :l], t[..., l:]) for t in (q, k, v)) + ql, kl, vl = (apply_rotary_pos_emb(t, rotary_pos_emb) for t in (ql, kl, vl)) + q, k, v = (torch.cat(t, dim=-1) for t in ((ql, qr), (kl, kr), (vl, vr))) input_mask = None if any(map(exists, (mask, context_mask))): @@ -664,7 +664,7 @@ def forward( input_mask = q_mask * k_mask if self.num_mem_kv > 0: - mem_k, mem_v = map(lambda t: repeat(t, "h n d -> b h n d", b=b), (self.mem_k, self.mem_v)) + mem_k, mem_v = (repeat(t, "h n d -> b h n d", b=b) for t in (self.mem_k, self.mem_v)) k = torch.cat((mem_k, k), dim=-2) v = torch.cat((mem_v, v), dim=-2) if exists(input_mask): @@ -964,9 +964,7 @@ def forward( seq_len = x.shape[1] if past_key_values is not None: seq_len += past_key_values[0][0].shape[-2] - max_rotary_emb_length = max( - list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len] - ) + max_rotary_emb_length = max([(m.shape[1] if exists(m) else 0) + seq_len for m in mems] + [expected_seq_len]) rotary_pos_emb = 
self.rotary_pos_emb(max_rotary_emb_length, x.device) present_key_values = [] @@ -1200,7 +1198,7 @@ def forward( res = [out] if return_attn: - attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates] res.append(attn_maps) if use_cache: res.append(intermediates.past_key_values) @@ -1249,7 +1247,7 @@ def forward(self, x, return_embeddings=False, mask=None, return_attn=False, mems res = [out] if return_attn: - attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates] res.append(attn_maps) if use_cache: res.append(intermediates.past_key_values) diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index c27d11bef6..3449739fdc 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ b/TTS/tts/layers/vits/discriminator.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.conv import Conv1d -from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator +from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP class DiscriminatorS(torch.nn.Module): diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index f97b584fe6..50ed1024de 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -10,22 +10,6 @@ LRELU_SLOPE = 0.1 -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - class TextEncoder(nn.Module): def __init__( self, diff --git a/TTS/tts/layers/xtts/__init__.py b/TTS/tts/layers/xtts/__init__.py 
new file mode 100644 index 0000000000..e69de29bb2 diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index bdd7a9d09f..4a37307e74 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -1,4 +1,5 @@ import functools +import logging from math import sqrt import torch @@ -8,6 +9,8 @@ import torchaudio from einops import rearrange +logger = logging.getLogger(__name__) + def default(val, d): return val if val is not None else d @@ -79,7 +82,7 @@ def forward(self, input, return_soft_codes=False): self.embed_avg = (ea * ~mask + rand_embed).permute(1, 0) self.cluster_size = self.cluster_size * ~mask.squeeze() if torch.any(mask): - print(f"Reset {torch.sum(mask)} embedding codes.") + logger.info("Reset %d embedding codes.", torch.sum(mask)) self.codes = None self.codes_full = False @@ -260,7 +263,7 @@ def __init__( dec_init_chan = codebook_dim if not has_resblocks else dec_chans[0] dec_chans = [dec_init_chan, *dec_chans] - enc_chans_io, dec_chans_io = map(lambda t: list(zip(t[:-1], t[1:])), (enc_chans, dec_chans)) + enc_chans_io, dec_chans_io = (list(zip(t[:-1], t[1:])) for t in (enc_chans, dec_chans)) pad = (kernel_size - 1) // 2 for (enc_in, enc_out), (dec_in, dec_out) in zip(enc_chans_io, dec_chans_io): @@ -306,9 +309,9 @@ def norm(self, images): if not self.normalization is not None: return images - means, stds = map(lambda t: torch.as_tensor(t).to(images), self.normalization) + means, stds = (torch.as_tensor(t).to(images) for t in self.normalization) arrange = "c -> () c () ()" if self.positional_dims == 2 else "c -> () c ()" - means, stds = map(lambda t: rearrange(t, arrange), (means, stds)) + means, stds = (rearrange(t, arrange) for t in (means, stds)) images = images.clone() images.sub_(means).div_(stds) return images diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index e7b186b858..b55b84d90e 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -1,7 +1,6 @@ # ported from: 
https://github.com/neonbjb/tortoise-tts import functools -import math import random import torch @@ -188,9 +187,9 @@ def __init__( def get_grad_norm_parameter_groups(self): return { "conditioning_encoder": list(self.conditioning_encoder.parameters()), - "conditioning_perceiver": list(self.conditioning_perceiver.parameters()) - if self.use_perceiver_resampler - else None, + "conditioning_perceiver": ( + list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None + ), "gpt": list(self.gpt.parameters()), "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()), } diff --git a/TTS/tts/layers/xtts/gpt_inference.py b/TTS/tts/layers/xtts/gpt_inference.py index d44bd3decd..4625ae1ba9 100644 --- a/TTS/tts/layers/xtts/gpt_inference.py +++ b/TTS/tts/layers/xtts/gpt_inference.py @@ -1,5 +1,3 @@ -import math - import torch from torch import nn from transformers import GPT2PreTrainedModel diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 9add7826e6..b6032e5584 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -1,3 +1,5 @@ +import logging + import torch import torchaudio from torch import nn @@ -5,14 +7,13 @@ from torch.nn import functional as F from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations +from trainer.io import load_fsspec -from TTS.utils.io import load_fsspec - -LRELU_SLOPE = 0.1 +from TTS.vocoder.models.hifigan_generator import get_padding +logger = logging.getLogger(__name__) -def get_padding(k, d): - return int((k * d - d) / 2) +LRELU_SLOPE = 0.1 class ResBlock1(torch.nn.Module): @@ -316,7 +317,7 @@ def inference(self, c): return self.forward(c) def remove_weight_norm(self): - print("Removing weight norm...") + logger.info("Removing weight norm...") for l in self.ups: remove_parametrizations(l, "weight") for l in self.resblocks: @@ -390,7 +391,7 @@ def 
set_init_dict(model_dict, checkpoint_state, c): # Partial initialization: if there is a mismatch with new and old layer, it is skipped. for k, v in checkpoint_state.items(): if k not in model_dict: - print(" | > Layer missing in the model definition: {}".format(k)) + logger.warning("Layer missing in the model definition: %s", k) # 1. filter out unnecessary keys pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} # 2. filter out different size layers @@ -401,7 +402,7 @@ def set_init_dict(model_dict, checkpoint_state, c): pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} # 4. overwrite entries in the existing state dict model_dict.update(pretrained_dict) - print(" | > {} / {} layers are restored.".format(len(pretrained_dict), len(model_dict))) + logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) return model_dict @@ -579,13 +580,13 @@ def load_checkpoint( state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) try: self.load_state_dict(state["model"]) - print(" > Model fully restored. 
") + logger.info("Model fully restored.") except (KeyError, RuntimeError) as error: # If eval raise the error if eval: raise error - print(" > Partial model initialization.") + logger.info("Partial model initialization.") model_dict = self.state_dict() model_dict = set_init_dict(model_dict, state["model"]) self.load_state_dict(model_dict) @@ -596,7 +597,7 @@ def load_checkpoint( try: criterion.load_state_dict(state["criterion"]) except (KeyError, RuntimeError) as error: - print(" > Criterion load ignored because of:", error) + logger.exception("Criterion load ignored because of: %s", error) if use_cuda: self.cuda() diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py index 7b7ee79b50..f4b6e84123 100644 --- a/TTS/tts/layers/xtts/perceiver_encoder.py +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -7,7 +7,6 @@ import torch.nn.functional as F from einops import rearrange, repeat from einops.layers.torch import Rearrange -from packaging import version from torch import einsum, nn @@ -44,9 +43,6 @@ def __init__(self, dropout=0.0, causal=False, use_flash=False): self.register_buffer("mask", None, persistent=False) self.use_flash = use_flash - assert not ( - use_flash and version.parse(torch.__version__) < version.parse("2.0.0") - ), "in order to use flash attention, you must be using pytorch 2.0 or above" # determine efficient attention configs for cuda and cpu self.config = namedtuple("EfficientAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"]) @@ -155,10 +151,6 @@ def Sequential(*mods): return nn.Sequential(*filter(exists, mods)) -def exists(x): - return x is not None - - def default(val, d): if exists(val): return val diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index e12f8995cf..efc92a04ef 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings 
-from typing import Callable, List, Optional, Union +from typing import Callable, Optional, Union import numpy as np import torch @@ -21,10 +21,11 @@ PreTrainedModel, StoppingCriteriaList, ) +from transformers.generation.stopping_criteria import validate_stopping_criteria from transformers.generation.utils import GenerateOutput, SampleOutput, logger -def setup_seed(seed): +def setup_seed(seed: int) -> None: if seed == -1: return torch.manual_seed(seed) @@ -43,15 +44,15 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): @torch.no_grad() - def generate( + def generate( # noqa: PLR0911 self, inputs: Optional[torch.Tensor] = None, generation_config: Optional[StreamGenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, synced_gpus: Optional[bool] = False, - seed=0, + seed: int = 0, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: r""" @@ -90,7 +91,7 @@ def generate( Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. - prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): If provided, this function constraints the beam search to allowed tokens only at each step. If not provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned @@ -151,18 +152,7 @@ def generate( # 2. 
Set generation parameters if not already defined logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - if generation_config.pad_token_id is None and generation_config.eos_token_id is not None: - if model_kwargs.get("attention_mask", None) is None: - logger.warning( - "The attention mask and the pad token id were not set. As a consequence, you may observe " - "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results." - ) - eos_token_id = generation_config.eos_token_id - if isinstance(eos_token_id, list): - eos_token_id = eos_token_id[0] - logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") - generation_config.pad_token_id = eos_token_id + kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None # 3. Define model inputs # inputs_tensor has to be defined @@ -174,6 +164,9 @@ def generate( ) batch_size = inputs_tensor.shape[0] + device = inputs_tensor.device + self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) + # 4. Define other model kwargs model_kwargs["output_attentions"] = generation_config.output_attentions model_kwargs["output_hidden_states"] = generation_config.output_hidden_states @@ -182,7 +175,7 @@ def generate( accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs - if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: + if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( inputs_tensor, generation_config.pad_token_id, @@ -209,16 +202,15 @@ def generate( # 5. 
Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: - input_ids = self._prepare_decoder_input_ids_for_generation( - batch_size, - decoder_start_token_id=generation_config.decoder_start_token_id, - bos_token_id=generation_config.bos_token_id, + input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( + batch_size=batch_size, + model_input_name=model_input_name, model_kwargs=model_kwargs, + decoder_start_token_id=generation_config.decoder_start_token_id, device=inputs_tensor.device, ) else: - # if decoder-only then inputs_tensor has to be `input_ids` - input_ids = inputs_tensor + input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") # 6. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] @@ -384,7 +376,7 @@ def generate( elif is_sample_gen_mode: # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config) + logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) # 12. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -409,7 +401,7 @@ def generate( ) elif is_sample_gen_stream_mode: # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config) + logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) # 12. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -471,7 +463,7 @@ def generate( elif is_beam_sample_gen_mode: # 11. 
prepare logits warper - logits_warper = self._get_logits_warper(generation_config) + logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) if stopping_criteria.max_length is None: raise ValueError("`max_length` needs to be a stopping_criteria for now.") @@ -577,7 +569,7 @@ def generate( def typeerror(): raise ValueError( - "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]`" + "`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]`" f"of positive integers, but is {generation_config.force_words_ids}." ) @@ -649,7 +641,7 @@ def sample_stream( logits_warper: Optional[LogitsProcessorList] = None, max_length: Optional[int] = None, pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, + eos_token_id: Optional[Union[int, list[int]]] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, @@ -885,10 +877,10 @@ def init_stream_support(): if __name__ == "__main__": - from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel + from transformers import AutoModelForCausalLM, AutoTokenizer + + init_stream_support() - PreTrainedModel.generate = NewGenerationMixin.generate - PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m") diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1a3cc47aaf..5e701c085f 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -1,24 +1,26 @@ +import logging import os import re import textwrap from functools import cached_property -import pypinyin import torch -from hangul_romanize import Transliter -from hangul_romanize.rule import academic from num2words import num2words from spacy.lang.ar import 
Arabic from spacy.lang.en import English from spacy.lang.es import Spanish +from spacy.lang.hi import Hindi from spacy.lang.ja import Japanese from spacy.lang.zh import Chinese from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words +logger = logging.getLogger(__name__) + def get_spacy_lang(lang): + """Return Spacy language used for sentence splitting.""" if lang == "zh": return Chinese() elif lang == "ja": @@ -27,8 +29,10 @@ def get_spacy_lang(lang): return Arabic() elif lang == "es": return Spanish() + elif lang == "hi": + return Hindi() else: - # For most languages, Enlish does the job + # For most languages, English does the job return English() @@ -570,6 +574,10 @@ def basic_cleaners(text): def chinese_transliterate(text): + try: + import pypinyin + except ImportError as e: + raise ImportError("Chinese requires: pypinyin") from e return "".join( [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)] ) @@ -582,6 +590,11 @@ def japanese_cleaners(text, katsu): def korean_transliterate(text): + try: + from hangul_romanize import Transliter + from hangul_romanize.rule import academic + except ImportError as e: + raise ImportError("Korean requires: hangul_romanize") from e r = Transliter(academic) return r.translit(text) @@ -611,6 +624,7 @@ def __init__(self, vocab_file=None): "ja": 71, "hu": 224, "ko": 95, + "hi": 150, } @cached_property @@ -623,8 +637,10 @@ def check_input_length(self, txt, lang): lang = lang.split("-")[0] # remove the region limit = self.char_limits.get(lang, 250) if len(txt) > limit: - print( - f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio." 
+ logger.warning( + "The text length exceeds the character limit of %d for language '%s', this might cause truncated audio.", + limit, + lang, ) def preprocess_text(self, txt, lang): diff --git a/TTS/tts/layers/xtts/trainer/dataset.py b/TTS/tts/layers/xtts/trainer/dataset.py index 2f958cb5a5..e598232665 100644 --- a/TTS/tts/layers/xtts/trainer/dataset.py +++ b/TTS/tts/layers/xtts/trainer/dataset.py @@ -1,4 +1,4 @@ -import os +import logging import random import sys @@ -8,6 +8,8 @@ from TTS.tts.models.xtts import load_audio +logger = logging.getLogger(__name__) + torch.set_num_threads(1) @@ -71,13 +73,13 @@ def __init__(self, config, samples, tokenizer, sample_rate, is_eval=False): random.shuffle(self.samples) # order by language self.samples = key_samples_by_col(self.samples, "language") - print(" > Sampling by language:", self.samples.keys()) + logger.info("Sampling by language: %s", self.samples.keys()) else: # for evaluation load and check samples that are corrupted to ensures the reproducibility self.check_eval_samples() def check_eval_samples(self): - print(" > Filtering invalid eval samples!!") + logger.info("Filtering invalid eval samples!!") new_samples = [] for sample in self.samples: try: @@ -93,7 +95,7 @@ def check_eval_samples(self): continue new_samples.append(sample) self.samples = new_samples - print(" > Total eval samples after filtering:", len(self.samples)) + logger.info("Total eval samples after filtering: %d", len(self.samples)) def get_text(self, text, lang): tokens = self.tokenizer.encode(text, lang) @@ -151,7 +153,7 @@ def __getitem__(self, index): # ignore samples that we already know that is not valid ones if sample_id in self.failed_samples: if self.debug_failures: - print(f"Ignoring sample {sample['audio_file']} because it was already ignored before !!") + logger.info("Ignoring sample %s because it was already ignored before !!", sample["audio_file"]) # call get item again to get other sample return self[1] @@ -160,7 +162,7 @@ def 
__getitem__(self, index): tseq, audiopath, wav, cond, cond_len, cond_idxs = self.load_item(sample) except: if self.debug_failures: - print(f"error loading {sample['audio_file']} {sys.exc_info()}") + logger.warning("Error loading %s %s", sample["audio_file"], sys.exc_info()) self.failed_samples.add(sample_id) return self[1] @@ -173,8 +175,11 @@ def __getitem__(self, index): # Basically, this audio file is nonexistent or too long to be supported by the dataset. # It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result. if self.debug_failures and wav is not None and tseq is not None: - print( - f"error loading {sample['audio_file']}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}" + logger.warning( + "Error loading %s: ranges are out of bounds: %d, %d", + sample["audio_file"], + wav.shape[-1], + tseq.shape[0], ) self.failed_samples.add(sample_id) return self[1] @@ -187,9 +192,9 @@ def __getitem__(self, index): "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long), "filenames": audiopath, "conditioning": cond.unsqueeze(1), - "cond_lens": torch.tensor(cond_len, dtype=torch.long) - if cond_len is not torch.nan - else torch.tensor([cond_len]), + "cond_lens": ( + torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len]) + ), "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]), } return res diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 9a7a1d7783..04d123778b 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,3 +1,4 @@ +import logging from dataclasses import dataclass, field from typing import Dict, List, Tuple, Union @@ -5,8 +6,8 @@ import torch.nn as nn import torchaudio from coqpit import Coqpit -from torch.nn import functional as F from torch.utils.data import DataLoader 
+from trainer.io import load_fsspec from trainer.torch import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler @@ -18,7 +19,8 @@ from TTS.tts.layers.xtts.trainer.dataset import XTTSDataset from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) @dataclass @@ -58,7 +60,7 @@ def callback_clearml_load_save(operation_type, model_info): # return None means skip the file upload/log, returning model_info will continue with the log/upload # you can also change the upload destination file name model_info.upload_filename or check the local file size with Path(model_info.local_model_path).stat().st_size assert operation_type in ("load", "save") - # print(operation_type, model_info.__dict__) + logger.debug("%s %s", operation_type, model_info.__dict__) if "similarities.pth" in model_info.__dict__["local_model_path"]: return None @@ -92,7 +94,7 @@ def __init__(self, config: Coqpit): gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu")) # deal with coqui Trainer exported model if "model" in gpt_checkpoint.keys() and "config" in gpt_checkpoint.keys(): - print("Coqui Trainer checkpoint detected! Converting it!") + logger.info("Coqui Trainer checkpoint detected! 
Converting it!") gpt_checkpoint = gpt_checkpoint["model"] states_keys = list(gpt_checkpoint.keys()) for key in states_keys: @@ -111,7 +113,7 @@ def __init__(self, config: Coqpit): num_new_tokens = ( self.xtts.gpt.text_embedding.weight.shape[0] - gpt_checkpoint["text_embedding.weight"].shape[0] ) - print(f" > Loading checkpoint with {num_new_tokens} additional tokens.") + logger.info("Loading checkpoint with %d additional tokens.", num_new_tokens) # add new tokens to a linear layer (text_head) emb_g = gpt_checkpoint["text_embedding.weight"] @@ -138,7 +140,7 @@ def __init__(self, config: Coqpit): gpt_checkpoint["text_head.bias"] = text_head_bias self.xtts.gpt.load_state_dict(gpt_checkpoint, strict=True) - print(">> GPT weights restored from:", self.args.gpt_checkpoint) + logger.info("GPT weights restored from: %s", self.args.gpt_checkpoint) # Mel spectrogram extractor for conditioning if self.args.gpt_use_perceiver_resampler: @@ -184,7 +186,7 @@ def __init__(self, config: Coqpit): if self.args.dvae_checkpoint: dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu")) self.dvae.load_state_dict(dvae_checkpoint, strict=False) - print(">> DVAE weights restored from:", self.args.dvae_checkpoint) + logger.info("DVAE weights restored from: %s", self.args.dvae_checkpoint) else: raise RuntimeError( "You need to specify config.model_args.dvae_checkpoint path to be able to train the GPT decoder!!" 
@@ -230,7 +232,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 # init gpt for inference mode self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False) self.xtts.gpt.eval() - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") for idx, s_info in enumerate(self.config.test_sentences): wav = self.xtts.synthesize( s_info["text"], @@ -391,7 +393,7 @@ def get_data_loader( loader = DataLoader( dataset, sampler=sampler, - batch_size = config.eval_batch_size if is_eval else config.batch_size, + batch_size=config.eval_batch_size if is_eval else config.batch_size, collate_fn=dataset.collate_fn, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py index 3e7d0f6c91..5560e87687 100644 --- a/TTS/tts/layers/xtts/xtts_manager.py +++ b/TTS/tts/layers/xtts/xtts_manager.py @@ -1,34 +1,35 @@ import torch -class SpeakerManager(): + +class SpeakerManager: def __init__(self, speaker_file_path=None): self.speakers = torch.load(speaker_file_path) @property def name_to_id(self): - return self.speakers.keys() - + return self.speakers + @property def num_speakers(self): return len(self.name_to_id) - + @property def speaker_names(self): return list(self.name_to_id.keys()) - -class LanguageManager(): + +class LanguageManager: def __init__(self, config): self.langs = config["languages"] @property def name_to_id(self): return self.langs - + @property def num_languages(self): return len(self.name_to_id) - + @property def language_names(self): return list(self.name_to_id) diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index e59ccb6630..69b8dae952 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -4,13 +4,14 @@ import argparse import csv -import os +import logging import re import string 
import sys -# fmt: off +logger = logging.getLogger(__name__) +# fmt: off # ================================================================================ # # basic constant # ================================================================================ # @@ -491,8 +492,6 @@ class NumberSystem(object): 中文数字įŗģįģŸ """ - pass - class MathSymbol(object): """ @@ -927,12 +926,13 @@ def percentage2chntext(self): def normalize_nsw(raw_text): text = "^" + raw_text + "$" + logger.debug(text) # č§„čŒƒåŒ–æ—Ĩ期 pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})åš´)?(\d{1,2}月(\d{1,2}[æ—Ĩåˇ])?)?)") matchers = pattern.findall(text) if matchers: - # print('date') + logger.debug("date") for matcher in matchers: text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1) @@ -940,7 +940,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"\D+((\d+(\.\d+)?)[多äŊ™å‡ ]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)") matchers = pattern.findall(text) if matchers: - # print('money') + logger.debug("money") for matcher in matchers: text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1) @@ -953,14 +953,14 @@ def normalize_nsw(raw_text): pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D") matchers = pattern.findall(text) if matchers: - # print('telephone') + logger.debug("telephone") for matcher in matchers: text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1) # å›ēč¯ pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D") matchers = pattern.findall(text) if matchers: - # print('fixed telephone') + logger.debug("fixed telephone") for matcher in matchers: text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1) @@ -968,7 +968,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+/\d+)") matchers = pattern.findall(text) if matchers: - # print('fraction') + logger.debug("fraction") for matcher in matchers: text = 
text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1) @@ -977,7 +977,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+(\.\d+)?%)") matchers = pattern.findall(text) if matchers: - # print('percentage') + logger.debug("percentage") for matcher in matchers: text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1) @@ -985,7 +985,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+(\.\d+)?)[多äŊ™å‡ ]?" + COM_QUANTIFIERS) matchers = pattern.findall(text) if matchers: - # print('cardinal+quantifier') + logger.debug("cardinal+quantifier") for matcher in matchers: text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) @@ -993,7 +993,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d{4,32})") matchers = pattern.findall(text) if matchers: - # print('digit') + logger.debug("digit") for matcher in matchers: text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1) @@ -1001,7 +1001,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+(\.\d+)?)") matchers = pattern.findall(text) if matchers: - # print('cardinal') + logger.debug("cardinal") for matcher in matchers: text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) @@ -1009,7 +1009,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(([a-zA-Z]+)äēŒ([a-zA-Z]+))") matchers = pattern.findall(text) if matchers: - # print('particular') + logger.debug("particular") for matcher in matchers: text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1) @@ -1107,7 +1107,7 @@ def __call__(self, text): if self.check_chars: for c in text: if not IN_VALID_CHARS.get(c): - print(f"WARNING: illegal char {c} in: {text}", file=sys.stderr) + logger.warning("Illegal char %s in: %s", c, text) return "" if self.remove_space: diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 2bd2e5f087..ebfa171c80 100644 --- a/TTS/tts/models/__init__.py +++ 
b/TTS/tts/models/__init__.py @@ -1,10 +1,13 @@ +import logging from typing import Dict, List, Union from TTS.utils.generic_utils import find_module +logger = logging.getLogger(__name__) + def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": - print(" > Using model: {}".format(config.model)) + logger.info("Using model: %s", config.model) # fetch the right model implementation. if "base_model" in config and config["base_model"] is not None: MyModel = find_module("TTS.tts.models", config.base_model.lower()) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index b2e51de7d6..2d27a57850 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -4,6 +4,7 @@ import torch from coqpit import Coqpit from torch import nn +from trainer.io import load_fsspec from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder @@ -15,7 +16,6 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.io import load_fsspec @dataclass @@ -415,7 +415,7 @@ def _set_phase(config, global_step): """Decide AlignTTS training phase""" if isinstance(config.phase_start_steps, list): vals = [i < global_step for i in config.phase_start_steps] - if not True in vals: + if True not in vals: phase = 0 else: phase = ( diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index e5edffd4ef..cdfb5efae4 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -164,7 +164,7 @@ def generate_audio( return audio_arr, [x_semantic, c, f] def generate_voice(self, audio, speaker_id, voice_dir): - """Generate a voice from the given audio and text. + """Generate a voice from the given audio. Args: audio (str): Path to the audio file. 
@@ -174,7 +174,7 @@ def generate_voice(self, audio, speaker_id, voice_dir): if voice_dir is not None: voice_dirs = [voice_dir] try: - _ = load_voice(speaker_id, voice_dirs) + _ = load_voice(self, speaker_id, voice_dirs) except (KeyError, FileNotFoundError): output_path = os.path.join(voice_dir, speaker_id + ".npz") os.makedirs(voice_dir, exist_ok=True) @@ -225,14 +225,11 @@ def synthesize( return return_dict - def eval_step(self): - ... + def eval_step(self): ... - def forward(self): - ... + def forward(self): ... - def inference(self): - ... + def inference(self): ... @staticmethod def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index f38dace235..79cdf1a7d4 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,10 +1,12 @@ import copy +import logging from abc import abstractmethod from typing import Dict, Tuple import torch from coqpit import Coqpit from torch import nn +from trainer.io import load_fsspec from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models.base_tts import BaseTTS @@ -14,9 +16,10 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.generic_utils import format_aux_input -from TTS.utils.io import load_fsspec from TTS.utils.training import gradual_training_scheduler +logger = logging.getLogger(__name__) + class BaseTacotron(BaseTTS): """Base class shared by Tacotron and Tacotron2""" @@ -100,7 +103,8 @@ def load_checkpoint( config (Coqpi): model configuration. checkpoint_path (str): path to checkpoint file. eval (bool, optional): whether to load model for evaluation. - cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False. + cache (bool, optional): If True, cache the file locally for subsequent calls. 
+ It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) @@ -116,7 +120,7 @@ def load_checkpoint( self.decoder.set_r(config.r) if eval: self.eval() - print(f" > Model's reduction rate `r` is set to: {self.decoder.r}") + logger.info("Model's reduction rate `r` is set to: %d", self.decoder.r) assert not self.training def get_criterion(self) -> nn.Module: @@ -148,7 +152,7 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. """ - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -302,4 +306,4 @@ def on_epoch_start(self, trainer): self.decoder.set_r(r) if trainer.config.bidirectional_decoder: trainer.model.decoder_backward.set_r(r) - print(f"\n > Number of output frames: {self.decoder.r}") + logger.info("Number of output frames: %d", self.decoder.r) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 7871cc38c3..ccb023ce84 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,3 +1,4 @@ +import logging import os import random from typing import Dict, List, Tuple, Union @@ -14,10 +15,12 @@ from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.data import get_length_balancer_weights from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +logger = logging.getLogger(__name__) + # pylint: skip-file @@ -105,7 +108,7 @@ def 
init_multispeaker(self, config: Coqpit, data: List = None): ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: - print(" > Init speaker_embedding layer.") + logger.info("Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) @@ -141,7 +144,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): if speaker_name is None: d_vector = self.speaker_manager.get_random_embedding() else: - d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) + d_vector = self.speaker_manager.get_mean_embedding(speaker_name) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_id() @@ -245,12 +248,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): if getattr(config, "use_language_weighted_sampler", False): alpha = getattr(config, "language_weighted_sampler_alpha", 1.0) - print(" > Using Language weighted sampler with alpha:", alpha) + logger.info("Using Language weighted sampler with alpha: %.2f", alpha) weights = get_language_balancer_weights(data_items) * alpha if getattr(config, "use_speaker_weighted_sampler", False): alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0) - print(" > Using Speaker weighted sampler with alpha:", alpha) + logger.info("Using Speaker weighted sampler with alpha: %.2f", alpha) if weights is not None: weights += get_speaker_balancer_weights(data_items) * alpha else: @@ -258,7 +261,7 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): if getattr(config, "use_length_weighted_sampler", False): alpha = getattr(config, "length_weighted_sampler_alpha", 1.0) - print(" > Using Length weighted sampler with alpha:", alpha) + logger.info("Using Length weighted sampler with alpha: %.2f", alpha) if weights is not None: weights += get_length_balancer_weights(data_items) * alpha else: @@ 
-330,7 +333,6 @@ def get_data_loader( phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=config.precompute_num_workers, use_noise_augment=False if is_eval else config.use_noise_augment, - verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, tokenizer=self.tokenizer, @@ -369,9 +371,11 @@ def _get_test_aux_input( d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1), + "speaker_id": ( + None + if not self.config.use_speaker_embedding + else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1) + ), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } @@ -388,7 +392,7 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. """ - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -427,8 +431,8 @@ def on_init_start(self, trainer): if hasattr(trainer.config, "model_args"): trainer.config.model_args.speakers_file = output_path trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) - print(f" > `speakers.pth` is saved to {output_path}.") - print(" > `speakers_file` is updated in the config.json.") + logger.info("`speakers.pth` is saved to: %s", output_path) + logger.info("`speakers_file` is updated in the config.json.") if self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") @@ -437,8 +441,8 @@ def on_init_start(self, trainer): if hasattr(trainer.config, "model_args"): trainer.config.model_args.language_ids_file = output_path trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) - print(f" > 
`language_ids.json` is saved to {output_path}.") - print(" > `language_ids_file` is updated in the config.json.") + logger.info("`language_ids.json` is saved to: %s", output_path) + logger.info("`language_ids_file` is updated in the config.json.") class BaseTTSE2E(BaseTTS): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index b1cf886bea..a938a3a4ab 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -1,3 +1,4 @@ +import logging import os from dataclasses import dataclass, field from itertools import chain @@ -15,6 +16,7 @@ from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler +from trainer.io import load_fsspec from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler @@ -31,11 +33,12 @@ from TTS.utils.audio.numpy_transforms import db_to_amp as db_to_amp_numpy from TTS.utils.audio.numpy_transforms import mel_to_wav as mel_to_wav_numpy from TTS.utils.audio.processor import AudioProcessor -from TTS.utils.io import load_fsspec from TTS.vocoder.layers.losses import MultiScaleSTFTLoss from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results +logger = logging.getLogger(__name__) + def id_to_torch(aux_id, cuda=False): if aux_id is not None: @@ -85,12 +88,6 @@ def pad(input_ele: List[torch.Tensor], max_len: int) -> torch.Tensor: return out_padded -def init_weights(m: nn.Module, mean: float = 0.0, std: float = 0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: return torch.ceil(lens / stride).int() @@ -162,9 +159,9 @@ def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): y = y.squeeze(1) if torch.min(y) < -1.0: - print("min value is ", 
torch.min(y)) + logger.info("min value is %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("max value is %.3f", torch.max(y)) global hann_window # pylint: disable=global-statement dtype_device = str(y.dtype) + "_" + str(y.device) @@ -179,17 +176,19 @@ def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) return spec @@ -251,9 +250,9 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm y = y.squeeze(1) if torch.min(y) < -1.0: - print("min value is ", torch.min(y)) + logger.info("min value is %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("max value is %.3f", torch.max(y)) global mel_basis, hann_window # pylint: disable=global-statement mel_basis_key = name_mel_basis(y, n_fft, fmax) @@ -274,17 +273,19 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -324,7 
+325,6 @@ def __init__( self, ap, samples: Union[List[List], List[Dict]], - verbose=False, cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -332,7 +332,6 @@ def __init__( super().__init__( samples=samples, ap=ap, - verbose=verbose, cache_path=cache_path, precompute_num_workers=precompute_num_workers, normalize_f0=normalize_f0, @@ -404,7 +403,7 @@ def __getitem__(self, idx): try: token_ids = self.get_token_ids(idx, item["text"]) except: - print(idx, item) + logger.exception("%s %s", idx, item) # pylint: disable=raise-missing-from raise OSError f0 = None @@ -769,7 +768,7 @@ def init_multispeaker(self, config: Coqpit): def _init_speaker_embedding(self): # pylint: disable=attribute-defined-outside-init if self.num_speakers > 0: - print(" > initialization of speaker-embedding layers.") + logger.info("Initialization of speaker-embedding layers.") self.embedded_speaker_dim = self.args.speaker_embedding_channels self.args.embedded_speaker_dim = self.args.speaker_embedding_channels @@ -1287,7 +1286,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
""" - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -1401,14 +1400,14 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): data_items = dataset.samples if getattr(config, "use_weighted_sampler", False): for attr_name, alpha in config.weighted_sampler_attrs.items(): - print(f" > Using weighted sampler for attribute '{attr_name}' with alpha '{alpha}'") + logger.info("Using weighted sampler for attribute '%s' with alpha %.2f", attr_name, alpha) multi_dict = config.weighted_sampler_multipliers.get(attr_name, None) - print(multi_dict) + logger.info(multi_dict) weights, attr_names, attr_weights = get_attribute_balancer_weights( attr_name=attr_name, items=data_items, multi_dict=multi_dict ) weights = weights * alpha - print(f" > Attribute weights for '{attr_names}' \n | > {attr_weights}") + logger.info("Attribute weights for '%s' \n | > %s", attr_names, attr_weights) if weights is not None: sampler = WeightedRandomSampler(weights, len(weights)) @@ -1448,7 +1447,6 @@ def get_data_loader( compute_f0=config.compute_f0, f0_cache_path=config.f0_cache_path, attn_prior_cache_path=config.attn_prior_cache_path if config.use_attn_priors else None, - verbose=verbose, tokenizer=self.tokenizer, start_by_longest=config.start_by_longest, ) @@ -1525,7 +1523,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument @staticmethod def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=False + config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None ): # pylint: disable=unused-argument """Initiate model from config diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index b6e9ac8a14..4b74462dd5 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,3 +1,4 @@ +import logging from dataclasses import dataclass, 
field from typing import Dict, List, Tuple, Union @@ -5,6 +6,7 @@ from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast +from trainer.io import load_fsspec from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.encoder import Encoder @@ -16,7 +18,8 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) @dataclass @@ -299,11 +302,11 @@ def init_multispeaker(self, config: Coqpit): if config.use_d_vector_file: self.embedded_speaker_dim = config.d_vector_dim if self.args.d_vector_dim != self.args.hidden_channels: - #self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) + # self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) self.proj_g = nn.Linear(in_features=self.args.d_vector_dim, out_features=self.args.hidden_channels) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: - print(" > Init speaker_embedding layer.") + logger.info("Init speaker_embedding layer.") self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @@ -404,13 +407,13 @@ def _forward_encoder( # [B, T, C] x_emb = self.emb(x) # encoder pass - #o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) + # o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask, g) # speaker conditioning # TODO: try different ways of conditioning - if g is not None: + if g is not None: if hasattr(self, "proj_g"): - g = self.proj_g(g.view(g.shape[0], -1)).unsqueeze(-1) + g = self.proj_g(g.view(g.shape[0], -1)).unsqueeze(-1) o_en = o_en + g return o_en, x_mask, g, x_emb diff --git 
a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index bfd1a2b618..64954d283c 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,3 +1,4 @@ +import logging import math from typing import Dict, List, Tuple, Union @@ -6,6 +7,7 @@ from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F +from trainer.io import load_fsspec from TTS.tts.configs.glow_tts_config import GlowTTSConfig from TTS.tts.layers.glow_tts.decoder import Decoder @@ -16,7 +18,8 @@ from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) class GlowTTS(BaseTTS): @@ -53,7 +56,7 @@ class GlowTTS(BaseTTS): >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig >>> from TTS.tts.models.glow_tts import GlowTTS >>> config = GlowTTSConfig() - >>> model = GlowTTS.init_from_config(config, verbose=False) + >>> model = GlowTTS.init_from_config(config) """ def __init__( @@ -127,7 +130,7 @@ def init_multispeaker(self, config: Coqpit): ), " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: - print(" > Init speaker_embedding layer.") + logger.info("Init speaker_embedding layer.") self.embedded_speaker_dim = self.hidden_channels_enc self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @@ -479,13 +482,13 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
""" - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences aux_inputs = self._get_test_aux_input() if len(test_sentences) == 0: - print(" | [!] No test sentences provided.") + logger.warning("No test sentences provided.") else: for idx, sen in enumerate(test_sentences): outputs = synthesis( @@ -540,18 +543,17 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): """Initiate model from config Args: config (VitsConfig): Model config. samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. Defaults to None. - verbose (bool): If True, print init messages. Defaults to True. 
""" from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config, verbose) + ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) return GlowTTS(new_config, ap, tokenizer, speaker_manager) diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index e241410872..277369e644 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,9 +1,11 @@ +import logging import os from typing import Dict, List, Union import torch from coqpit import Coqpit from torch import nn +from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils @@ -17,7 +19,8 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.generic_utils import format_aux_input -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) class NeuralhmmTTS(BaseTTS): @@ -235,18 +238,17 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): """Initiate model from config Args: config (VitsConfig): Model config. samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. Defaults to None. - verbose (bool): If True, print init messages. Defaults to True. 
""" from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config, verbose) + ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) return NeuralhmmTTS(new_config, ap, tokenizer, speaker_manager) @@ -266,14 +268,17 @@ def on_init_start(self, trainer): dataloader = trainer.get_train_dataloader( training_assets=None, samples=trainer.train_samples, verbose=False ) - print( - f" | > Data parameters not found for: {trainer.config.mel_statistics_parameter_path}. Computing mel normalization parameters..." + logger.info( + "Data parameters not found for: %s. Computing mel normalization parameters...", + trainer.config.mel_statistics_parameter_path, ) data_mean, data_std, init_transition_prob = OverflowUtils.get_data_parameters_for_flat_start( dataloader, trainer.config.out_channels, trainer.config.state_per_phone ) - print( - f" | > Saving data parameters to: {trainer.config.mel_statistics_parameter_path}: value: {data_mean, data_std, init_transition_prob}" + logger.info( + "Saving data parameters to: %s: value: %s", + trainer.config.mel_statistics_parameter_path, + (data_mean, data_std, init_transition_prob), ) statistics = { "mean": data_mean.item(), @@ -283,8 +288,9 @@ def on_init_start(self, trainer): torch.save(statistics, trainer.config.mel_statistics_parameter_path) else: - print( - f" | > Data parameters found for: {trainer.config.mel_statistics_parameter_path}. Loading mel normalization parameters..." + logger.info( + "Data parameters found for: %s. 
Loading mel normalization parameters...", + trainer.config.mel_statistics_parameter_path, ) statistics = torch.load(trainer.config.mel_statistics_parameter_path) data_mean, data_std, init_transition_prob = ( @@ -292,7 +298,7 @@ def on_init_start(self, trainer): statistics["std"], statistics["init_transition_prob"], ) - print(f" | > Data parameters loaded with value: {data_mean, data_std, init_transition_prob}") + logger.info("Data parameters loaded with value: %s", (data_mean, data_std, init_transition_prob)) trainer.config.flat_start_params["transition_p"] = ( init_transition_prob.item() if torch.is_tensor(init_transition_prob) else init_transition_prob @@ -318,7 +324,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus } # sample one item from the batch -1 will give the smalles item - print(" | > Synthesising audio from the model...") + logger.info("Synthesising audio from the model...") inference_output = self.inference( batch["text_input"][-1].unsqueeze(0), aux_input={"x_lengths": batch["text_lengths"][-1].unsqueeze(0)} ) diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index 92b3c767de..b05b75009b 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,9 +1,11 @@ +import logging import os from typing import Dict, List, Union import torch from coqpit import Coqpit from torch import nn +from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils @@ -18,7 +20,8 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.generic_utils import format_aux_input -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) class Overflow(BaseTTS): @@ -250,18 +253,17 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: 
Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): """Initiate model from config Args: config (VitsConfig): Model config. samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. Defaults to None. - verbose (bool): If True, print init messages. Defaults to True. """ from TTS.utils.audio import AudioProcessor - ap = AudioProcessor.init_from_config(config, verbose) + ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) return Overflow(new_config, ap, tokenizer, speaker_manager) @@ -282,14 +284,17 @@ def on_init_start(self, trainer): dataloader = trainer.get_train_dataloader( training_assets=None, samples=trainer.train_samples, verbose=False ) - print( - f" | > Data parameters not found for: {trainer.config.mel_statistics_parameter_path}. Computing mel normalization parameters..." + logger.info( + "Data parameters not found for: %s. Computing mel normalization parameters...", + trainer.config.mel_statistics_parameter_path, ) data_mean, data_std, init_transition_prob = OverflowUtils.get_data_parameters_for_flat_start( dataloader, trainer.config.out_channels, trainer.config.state_per_phone ) - print( - f" | > Saving data parameters to: {trainer.config.mel_statistics_parameter_path}: value: {data_mean, data_std, init_transition_prob}" + logger.info( + "Saving data parameters to: %s: value: %s", + trainer.config.mel_statistics_parameter_path, + (data_mean, data_std, init_transition_prob), ) statistics = { "mean": data_mean.item(), @@ -299,8 +304,9 @@ def on_init_start(self, trainer): torch.save(statistics, trainer.config.mel_statistics_parameter_path) else: - print( - f" | > Data parameters found for: {trainer.config.mel_statistics_parameter_path}. Loading mel normalization parameters..." 
+ logger.info( + "Data parameters found for: %s. Loading mel normalization parameters...", + trainer.config.mel_statistics_parameter_path, ) statistics = torch.load(trainer.config.mel_statistics_parameter_path) data_mean, data_std, init_transition_prob = ( @@ -308,7 +314,7 @@ def on_init_start(self, trainer): statistics["std"], statistics["init_transition_prob"], ) - print(f" | > Data parameters loaded with value: {data_mean, data_std, init_transition_prob}") + logger.info("Data parameters loaded with value: %s", (data_mean, data_std, init_transition_prob)) trainer.config.flat_start_params["transition_p"] = ( init_transition_prob.item() if torch.is_tensor(init_transition_prob) else init_transition_prob @@ -334,7 +340,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus } # sample one item from the batch -1 will give the smalles item - print(" | > Synthesising audio from the model...") + logger.info("Synthesising audio from the model...") inference_output = self.inference( batch["text_input"][-1].unsqueeze(0), aux_input={"x_lengths": batch["text_lengths"][-1].unsqueeze(0)} ) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 474ec4641d..400a86d042 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -101,12 +101,16 @@ def __init__( num_mel=self.decoder_output_dim, encoder_output_dim=self.encoder_in_features, capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, - speaker_embedding_dim=self.embedded_speaker_dim - if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding - else None, - text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + speaker_embedding_dim=( + self.embedded_speaker_dim + if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding + else None + ), + text_summary_embedding_dim=( + 
self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), ) # backward pass decoder @@ -171,9 +175,9 @@ def forward( # pylint: disable=dangerous-default-value encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( encoder_outputs, reference_mel_info=[mel_specs, mel_lengths], - text_info=[inputs, text_lengths] - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + text_info=( + [inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None + ), speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, ) else: @@ -237,13 +241,13 @@ def inference(self, text_input, aux_input=None): # B x capacitron_VAE_embedding_dim encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( encoder_outputs, - reference_mel_info=[aux_input["style_mel"], reference_mel_length] - if aux_input["style_mel"] is not None - else None, + reference_mel_info=( + [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None + ), text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, - speaker_embedding=aux_input["d_vectors"] - if self.capacitron_vae.capacitron_use_speaker_embedding - else None, + speaker_embedding=( + aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), ) if self.num_speakers > 1: if not self.use_d_vector_file: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 71ab1eac37..4b1317f440 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -113,12 +113,14 @@ def __init__( num_mel=self.decoder_output_dim, encoder_output_dim=self.encoder_in_features, capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, - speaker_embedding_dim=self.embedded_speaker_dim - if 
self.capacitron_vae.capacitron_use_speaker_embedding - else None, - text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + speaker_embedding_dim=( + self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), + text_summary_embedding_dim=( + self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), ) # backward pass decoder @@ -191,9 +193,11 @@ def forward( # pylint: disable=dangerous-default-value encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( encoder_outputs, reference_mel_info=[mel_specs, mel_lengths], - text_info=[embedded_inputs.transpose(1, 2), text_lengths] - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + text_info=( + [embedded_inputs.transpose(1, 2), text_lengths] + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, ) else: @@ -265,13 +269,13 @@ def inference(self, text, aux_input=None): # B x capacitron_VAE_embedding_dim encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( encoder_outputs, - reference_mel_info=[aux_input["style_mel"], reference_mel_length] - if aux_input["style_mel"] is not None - else None, + reference_mel_info=( + [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None + ), text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, - speaker_embedding=aux_input["d_vectors"] - if self.capacitron_vae.capacitron_use_speaker_embedding - else None, + speaker_embedding=( + aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), ) if self.num_speakers > 1: diff --git a/TTS/tts/models/tortoise.py 
b/TTS/tts/models/tortoise.py index 16644ff95e..17303c69f7 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -1,3 +1,4 @@ +import logging import os import random from contextlib import contextmanager @@ -23,6 +24,8 @@ from TTS.tts.layers.tortoise.wav2vec_alignment import Wav2VecAlignment from TTS.tts.models.base_tts import BaseTTS +logger = logging.getLogger(__name__) + def pad_or_truncate(t, length): """ @@ -100,7 +103,7 @@ def fix_autoregressive_output(codes, stop_token, complain=True): stop_token_indices = (codes == stop_token).nonzero() if len(stop_token_indices) == 0: if complain: - print( + logger.warning( "No stop tokens found in one of the generated voice clips. This typically means the spoken audio is " "too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, " "try breaking up your input text." @@ -713,10 +716,10 @@ def inference( 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output" ) self.autoregressive = self.autoregressive.to(self.device) - if verbose: - print("Generating autoregressive samples..") - with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=half + logger.info("Generating autoregressive samples..") + with ( + self.temporary_cuda(self.autoregressive) as autoregressive, + torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half), ): for b in tqdm(range(num_batches), disable=not verbose): codes = autoregressive.inference_speech( @@ -737,8 +740,9 @@ def inference( self.autoregressive_batch_size = orig_batch_size # in the case of single_sample clip_results = [] - with self.temporary_cuda(self.clvp) as clvp, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=half + with ( + self.temporary_cuda(self.clvp) as clvp, + torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half), ): for batch in tqdm(samples, 
disable=not verbose): for i in range(batch.shape[0]): @@ -773,8 +777,7 @@ def inference( ) del auto_conditioning - if verbose: - print("Transforming autoregressive outputs into audio..") + logger.info("Transforming autoregressive outputs into audio..") wav_candidates = [] for b in range(best_results.shape[0]): codes = best_results[b].unsqueeze(0) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d9b1f59618..b014e4fdde 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1,3 +1,4 @@ +import logging import math import os from dataclasses import dataclass, field, replace @@ -15,6 +16,7 @@ from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler +from trainer.io import load_fsspec from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler @@ -33,11 +35,12 @@ from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment -from TTS.utils.io import load_fsspec from TTS.utils.samplers import BucketBatchSampler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results +logger = logging.getLogger(__name__) + ############################## # IO / Feature extraction ############################## @@ -104,9 +107,9 @@ def wav_to_spec(y, n_fft, hop_length, win_length, center=False): y = y.squeeze(1) if torch.min(y) < -1.0: - print("min value is ", torch.min(y)) + logger.info("min value is %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("max value is %.3f", torch.max(y)) global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -121,17 +124,19 @@ def wav_to_spec(y, n_fft, hop_length, win_length, center=False): ) y = 
y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -168,9 +173,9 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm y = y.squeeze(1) if torch.min(y) < -1.0: - print("min value is ", torch.min(y)) + logger.info("min value is %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("max value is %.3f", torch.max(y)) global mel_basis, hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -189,17 +194,19 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -760,7 +767,7 @@ def init_multispeaker(self, config: Coqpit): ) self.speaker_manager.encoder.eval() - print(" > External Speaker Encoder Loaded !!") + logger.info("External Speaker Encoder Loaded !!") if ( hasattr(self.speaker_manager.encoder, "audio_config") @@ -774,7 +781,7 @@ def init_multispeaker(self, config: Coqpit): def _init_speaker_embedding(self): # pylint: 
disable=attribute-defined-outside-init if self.num_speakers > 0: - print(" > initialization of speaker-embedding layers.") + logger.info("Initialization of speaker-embedding layers.") self.embedded_speaker_dim = self.args.speaker_embedding_channels self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) @@ -794,7 +801,7 @@ def init_multilingual(self, config: Coqpit): self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) if self.args.use_language_embedding and self.language_manager: - print(" > initialization of language-embedding layers.") + logger.info("Initialization of language-embedding layers.") self.num_languages = self.language_manager.num_languages self.embedded_language_dim = self.args.embedded_language_dim self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) @@ -829,7 +836,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 for key, value in after_dict.items(): if value == before_dict[key]: raise RuntimeError(" [!] The weights of Duration Predictor was not reinit check it !") - print(" > Duration Predictor was reinit.") + logger.info("Duration Predictor was reinit.") if self.args.reinit_text_encoder: before_dict = get_module_weights_sum(self.text_encoder) @@ -839,7 +846,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 for key, value in after_dict.items(): if value == before_dict[key]: raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !") - print(" > Text Encoder was reinit.") + logger.info("Text Encoder was reinit.") def get_aux_input(self, aux_input: Dict): sid, g, lid, _ = self._set_cond_input(aux_input) @@ -1233,7 +1240,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T Args: batch (Dict): Input tensors. criterion (nn.Module): Loss layer designed for the model. - optimizer_idx (int): Index of optimizer to use. 0 for the generator and 1 for the discriminator networks. 
+ optimizer_idx (int): Index of optimizer to use. 0 for the discriminator and 1 for the generator networks. Returns: Tuple[Dict, Dict]: Model ouputs and computed losses. @@ -1433,7 +1440,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: Returns: Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. """ - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -1550,14 +1557,14 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F data_items = dataset.samples if getattr(config, "use_weighted_sampler", False): for attr_name, alpha in config.weighted_sampler_attrs.items(): - print(f" > Using weighted sampler for attribute '{attr_name}' with alpha '{alpha}'") + logger.info("Using weighted sampler for attribute '%s' with alpha %.3f", attr_name, alpha) multi_dict = config.weighted_sampler_multipliers.get(attr_name, None) - print(multi_dict) + logger.info(multi_dict) weights, attr_names, attr_weights = get_attribute_balancer_weights( attr_name=attr_name, items=data_items, multi_dict=multi_dict ) weights = weights * alpha - print(f" > Attribute weights for '{attr_names}' \n | > {attr_weights}") + logger.info("Attribute weights for '%s' \n | > %s", attr_names, attr_weights) # input_audio_lenghts = [os.path.getsize(x["audio_file"]) for x in data_items] @@ -1605,7 +1612,6 @@ def get_data_loader( max_audio_len=config.max_audio_len, phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=config.precompute_num_workers, - verbose=verbose, tokenizer=self.tokenizer, start_by_longest=config.start_by_longest, ) @@ -1651,13 +1657,16 @@ def get_data_loader( def get_optimizer(self) -> List: """Initiate and return the GAN optimizers based on the config parameters. - It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. 
+ + It returns 2 optimizers in a list. First one is for the discriminator + and the second one is for the generator. + Returns: List: optimizers. """ - # select generator parameters optimizer0 = get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.disc) + # select generator parameters gen_parameters = chain(params for k, params in self.named_parameters() if not k.startswith("disc.")) optimizer1 = get_optimizer( self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters @@ -1712,7 +1721,7 @@ def load_checkpoint( # handle fine-tuning from a checkpoint with additional speakers if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape: num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0] - print(f" > Loading checkpoint with {num_new_speakers} additional speakers.") + logger.info("Loading checkpoint with %d additional speakers.", num_new_speakers) emb_g = state["model"]["emb_g.weight"] new_row = torch.randn(num_new_speakers, emb_g.shape[1]) emb_g = torch.cat([emb_g, new_row], axis=0) @@ -1769,7 +1778,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): """Initiate model from config Args: @@ -1792,7 +1801,7 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] upsample_rate == effective_hop_length ), f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" - ap = AudioProcessor.init_from_config(config, verbose=verbose) + ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) speaker_manager = SpeakerManager.init_from_config(config, samples) language_manager = LanguageManager.init_from_config(config) @@ -1880,16 +1889,18 @@ def onnx_inference(text, text_lengths, scales, sid=None, langid=None): self.forward = _forward if training: self.train() - if not disc is None: + if disc is not None: self.disc = disc def load_onnx(self, model_path: str, cuda=False): import onnxruntime as ort providers = [ - "CPUExecutionProvider" - if cuda is False - else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}) + ( + "CPUExecutionProvider" + if cuda is False + else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}) + ) ] sess_options = ort.SessionOptions() self.onnx_sess = ort.InferenceSession( @@ -1914,9 +1925,9 @@ def inference_onnx(self, x, x_lengths=None, speaker_id=None, language_id=None): dtype=np.float32, ) input_params = {"input": x, "input_lengths": x_lengths, "scales": scales} - if not speaker_id is None: + if speaker_id is not None: input_params["sid"] = torch.tensor([speaker_id]).cpu().numpy() - if not language_id is None: + if language_id is not None: input_params["langid"] = torch.tensor([language_id]).cpu().numpy() audio = self.onnx_sess.run( @@ -1948,8 +1959,7 @@ def __init__( def _create_vocab(self): self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank] self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} - # pylint: disable=unnecessary-comprehension - self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} + self._id_to_char = dict(enumerate(self.vocab)) @staticmethod def init_from_config(config: Coqpit): @@ -1996,4 +2006,4 @@ def vocab(self, vocab_file): self.blank = 
self._vocab[0] self.pad = " " self._char_to_id = {s: i for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension - self._id_to_char = {i: s for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension + self._id_to_char = dict(enumerate(self._vocab)) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 83812f377f..ef09344217 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -1,19 +1,23 @@ +import logging import os from dataclasses import dataclass +from pathlib import Path import librosa import torch import torch.nn.functional as F import torchaudio from coqpit import Coqpit +from trainer.io import load_fsspec from TTS.tts.layers.xtts.gpt import GPT from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence -from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager +from TTS.tts.layers.xtts.xtts_manager import LanguageManager, SpeakerManager from TTS.tts.models.base_tts import BaseTTS -from TTS.utils.io import load_fsspec + +logger = logging.getLogger(__name__) init_stream_support() @@ -82,7 +86,7 @@ def load_audio(audiopath, sampling_rate): # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk. # '10' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds. if torch.any(audio > 10) or not torch.any(audio < 0): - print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}") + logger.error("Error with %s. 
Max=%.2f min=%.2f", audiopath, audio.max(), audio.min()) # clip audio invalid values audio.clip_(-1, 1) return audio @@ -197,7 +201,7 @@ class Xtts(BaseTTS): >>> from TTS.tts.configs.xtts_config import XttsConfig >>> from TTS.tts.models.xtts import Xtts >>> config = XttsConfig() - >>> model = Xtts.inif_from_config(config) + >>> model = Xtts.init_from_config(config) >>> model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True) """ @@ -274,7 +278,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = for i in range(0, audio.shape[1], 22050 * chunk_length): audio_chunk = audio[:, i : i + 22050 * chunk_length] - # if the chunk is too short ignore it + # if the chunk is too short ignore it if audio_chunk.size(-1) < 22050 * 0.33: continue @@ -410,12 +414,14 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa if speaker_id is not None: gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values() return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings) - settings.update({ - "gpt_cond_len": config.gpt_cond_len, - "gpt_cond_chunk_len": config.gpt_cond_chunk_len, - "max_ref_len": config.max_ref_len, - "sound_norm_refs": config.sound_norm_refs, - }) + settings.update( + { + "gpt_cond_len": config.gpt_cond_len, + "gpt_cond_chunk_len": config.gpt_cond_chunk_len, + "max_ref_len": config.max_ref_len, + "sound_norm_refs": config.sound_norm_refs, + } + ) return self.full_inference(text, speaker_wav, language, **settings) @torch.inference_mode() @@ -693,12 +699,12 @@ def inference_stream( def forward(self): raise NotImplementedError( - "XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#training" ) def eval_step(self): raise NotImplementedError( - "XTTS has a 
dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#training" ) @staticmethod @@ -755,12 +761,18 @@ def load_checkpoint( """ model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") - vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") + if vocab_path is None: + if checkpoint_dir is not None and (Path(checkpoint_dir) / "vocab.json").is_file(): + vocab_path = str(Path(checkpoint_dir) / "vocab.json") + else: + vocab_path = config.model_args.tokenizer_file + + if speaker_file_path is None and checkpoint_dir is not None: + speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") self.language_manager = LanguageManager(config) self.speaker_manager = None - if os.path.exists(speaker_file_path): + if speaker_file_path is not None and os.path.exists(speaker_file_path): self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): @@ -785,5 +797,5 @@ def load_checkpoint( def train_step(self): raise NotImplementedError( - "XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#training" ) diff --git a/TTS/tts/utils/assets/tortoise/tokenizer.json b/TTS/tts/utils/assets/tortoise/tokenizer.json index a128f27305..c2fb44a729 100644 --- a/TTS/tts/utils/assets/tortoise/tokenizer.json +++ b/TTS/tts/utils/assets/tortoise/tokenizer.json @@ -1 +1 @@ 
-{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our
":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c 
k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":11
4,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 7b37201f84..7429d0fcc8 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -145,10 +145,9 @@ def average_over_durations(values, durs): return avg -def convert_pad_shape(pad_shape): +def convert_pad_shape(pad_shape: list[list]) -> list: l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape + return [item for sublist in l for item in sublist] def generate_path(duration, mask): diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 1e1836b32c..f134daf58e 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import fsspec import numpy as np @@ -59,7 +59,7 @@ def parse_language_ids_from_config(c: Coqpit) -> Dict: languages.add(dataset["language"]) else: raise ValueError(f"Dataset {dataset['name']} has no language specified.") - return {name: i for i, name in enumerate(sorted(list(languages)))} + return {name: i for i, name in 
enumerate(sorted(languages))} def set_language_ids_from_config(self, c: Coqpit) -> None: """Set language IDs from config samples. @@ -85,18 +85,18 @@ def save_ids_to_file(self, file_path: str) -> None: self._save_json(file_path, self.name_to_id) @staticmethod - def init_from_config(config: Coqpit) -> "LanguageManager": + def init_from_config(config: Coqpit) -> Optional["LanguageManager"]: """Initialize the language manager from a Coqpit config. Args: config (Coqpit): Coqpit config. """ - language_manager = None if check_config_and_model_args(config, "use_language_embedding", True): if config.get("language_ids_file", None): - language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) - language_manager = LanguageManager(config=config) - return language_manager + return LanguageManager(language_ids_file_path=config.language_ids_file) + # Fall back to parse language IDs from the config + return LanguageManager(config=config) + return None def _set_file_path(path): diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 1f94c5332d..23aa52a8a2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -193,7 +193,7 @@ def read_embeddings_from_file(file_path: str): embeddings = load_file(file_path) speakers = sorted({x["name"] for x in embeddings.values()}) name_to_id = {name: i for i, name in enumerate(speakers)} - clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys()))) + clip_ids = list(set(clip_name for clip_name in embeddings.keys())) # cache embeddings_by_names for fast inference using a bigger speakers.json embeddings_by_names = {} for x in embeddings.values(): diff --git a/TTS/tts/utils/monotonic_align/setup.py b/TTS/tts/utils/monotonic_align/setup.py deleted file mode 100644 index f22bc6a35a..0000000000 --- a/TTS/tts/utils/monotonic_align/setup.py +++ /dev/null @@ -1,7 +0,0 @@ -# from distutils.core import setup -# from Cython.Build import cythonize -# import numpy - -# 
setup(name='monotonic_align', -# ext_modules=cythonize("core.pyx"), -# include_dirs=[numpy.get_include()]) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index e49695268d..5229af81c5 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,4 +1,5 @@ import json +import logging import os from typing import Any, Dict, List, Union @@ -10,6 +11,8 @@ from TTS.config import get_from_config_or_model_args_with_default from TTS.tts.utils.managers import EmbeddingManager +logger = logging.getLogger(__name__) + class SpeakerManager(EmbeddingManager): """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information @@ -170,7 +173,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, if c.use_d_vector_file: # restore speaker manager with the embedding file if not os.path.exists(speakers_file): - print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file") + logger.warning( + "speakers.json was not found in %s, trying to use CONFIG.d_vector_file", restore_path + ) if not os.path.exists(c.d_vector_file): raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" @@ -193,16 +198,16 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager.load_ids_from_file(c.speakers_file) if speaker_manager.num_speakers > 0: - print( - " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id) - ) + logger.info( + "Speaker manager is loaded with %d speakers: %s", + speaker_manager.num_speakers, + ", ".join(speaker_manager.name_to_id), ) # save file if path is defined if out_path: out_file_path = os.path.join(out_path, "speakers.json") - print(f" > Saving `speakers.json` to {out_file_path}.") + logger.info("Saving `speakers.json` to %s", out_file_path) if c.use_d_vector_file and 
c.d_vector_file: speaker_manager.save_embeddings_to_file(out_file_path) else: diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 4bc3befc5b..eddf05db3f 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -207,6 +207,7 @@ class SSIMLoss(_Loss): https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf, DOI:`10.1109/TIP.2003.819861` """ + __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"] def __init__( diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index e15830fe8a..cddcb00fd5 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -1,8 +1,11 @@ import re -import bangla -from bnnumerizer import numerize -from bnunicodenormalizer import Normalizer +try: + import bangla + from bnnumerizer import numerize + from bnunicodenormalizer import Normalizer +except ImportError as e: + raise ImportError("Bangla requires: bangla, bnnumerizer, bnunicodenormalizer") from e # initialize bnorm = Normalizer() diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 8fa45ed84b..c622b93c59 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,8 +1,11 @@ +import logging from dataclasses import replace from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig +logger = logging.getLogger(__name__) + def parse_symbols(): return { @@ -87,9 +90,7 @@ def vocab(self, vocab): if vocab is not None: self._vocab = vocab self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)} - self._id_to_char = { - idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension - } + self._id_to_char = dict(enumerate(self._vocab)) @staticmethod def init_from_config(config, **kwargs): @@ -269,9 +270,7 @@ def vocab(self): def vocab(self, vocab): self._vocab = vocab self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} - 
self._id_to_char = { - idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension - } + self._id_to_char = dict(enumerate(self.vocab)) @property def num_chars(self): @@ -309,14 +308,14 @@ def print_log(self, level: int = 0): Prints the vocabulary in a nice format. """ indent = "\t" * level - print(f"{indent}| > Characters: {self._characters}") - print(f"{indent}| > Punctuations: {self._punctuations}") - print(f"{indent}| > Pad: {self._pad}") - print(f"{indent}| > EOS: {self._eos}") - print(f"{indent}| > BOS: {self._bos}") - print(f"{indent}| > Blank: {self._blank}") - print(f"{indent}| > Vocab: {self.vocab}") - print(f"{indent}| > Num chars: {self.num_chars}") + logger.info("%s| Characters: %s", indent, self._characters) + logger.info("%s| Punctuations: %s", indent, self._punctuations) + logger.info("%s| Pad: %s", indent, self._pad) + logger.info("%s| EOS: %s", indent, self._eos) + logger.info("%s| BOS: %s", indent, self._bos) + logger.info("%s| Blank: %s", indent, self._blank) + logger.info("%s| Vocab: %s", indent, self.vocab) + logger.info("%s| Num chars: %d", indent, self.num_chars) @staticmethod def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index 727c881e10..e9d62e9d06 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,7 +1,10 @@ from typing import List -import jieba -import pypinyin +try: + import jieba + import pypinyin +except ImportError as e: + raise ImportError("Chinese requires: jieba, pypinyin") from e from .pinyinToPhonemes import PINYIN_DICT diff --git a/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py index 4e25c3a4c9..89dd654ab1 100644 --- a/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py +++ 
b/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py @@ -94,25 +94,25 @@ "fo": ["fo"], "fou": ["fou"], "fu": ["fu"], - "ga": ["ga"], - "gai": ["gai"], - "gan": ["gan"], - "gang": ["gɑŋ"], - "gao": ["gaʌ"], - "ge": ["gø"], - "gei": ["gei"], - "gen": ["gœn"], - "geng": ["gÉĩŋ"], - "gong": ["goŋ"], - "gou": ["gou"], - "gu": ["gu"], - "gua": ["gua"], - "guai": ["guai"], - "guan": ["guan"], - "guang": ["guɑŋ"], - "gui": ["guei"], - "gun": ["gun"], - "guo": ["guo"], + "ga": ["ÉĄa"], + "gai": ["ÉĄai"], + "gan": ["ÉĄan"], + "gang": ["ÉĄÉ‘Å‹"], + "gao": ["ÉĄaʌ"], + "ge": ["ÉĄÃ¸"], + "gei": ["ÉĄei"], + "gen": ["ÉĄÅ“n"], + "geng": ["ÉĄÉĩŋ"], + "gong": ["ÉĄoŋ"], + "gou": ["ÉĄou"], + "gu": ["ÉĄu"], + "gua": ["ÉĄua"], + "guai": ["ÉĄuai"], + "guan": ["ÉĄuan"], + "guang": ["ÉĄuɑŋ"], + "gui": ["ÉĄuei"], + "gun": ["ÉĄun"], + "guo": ["ÉĄuo"], "ha": ["xa"], "hai": ["xai"], "han": ["xan"], diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 74d3910b51..fc87025f00 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,9 @@ """Set of default text cleaners""" + # TODO: pick the cleaner for languages dynamically import re +from typing import Optional from anyascii import anyascii @@ -16,35 +18,38 @@ _whitespace_re = re.compile(r"\s+") -def expand_abbreviations(text, lang="en"): +def expand_abbreviations(text: str, lang: str = "en") -> str: if lang == "en": _abbreviations = abbreviations_en elif lang == "fr": _abbreviations = abbreviations_fr + else: + msg = f"Language {lang} not supported in expand_abbreviations" + raise ValueError(msg) for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text -def lowercase(text): +def lowercase(text: str) -> str: return text.lower() -def collapse_whitespace(text): +def collapse_whitespace(text: str) -> str: return re.sub(_whitespace_re, " ", text).strip() -def convert_to_ascii(text): +def convert_to_ascii(text: str) -> str: return anyascii(text) 
-def remove_aux_symbols(text): +def remove_aux_symbols(text: str) -> str: text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text) return text -def replace_symbols(text, lang="en"): - """Replace symbols based on the lenguage tag. +def replace_symbols(text: str, lang: Optional[str] = "en") -> str: + """Replace symbols based on the language tag. Args: text: @@ -76,14 +81,14 @@ def replace_symbols(text, lang="en"): return text -def basic_cleaners(text): +def basic_cleaners(text: str) -> str: """Basic pipeline that lowercases and collapses whitespace without transliteration.""" text = lowercase(text) text = collapse_whitespace(text) return text -def transliteration_cleaners(text): +def transliteration_cleaners(text: str) -> str: """Pipeline for non-English text that transliterates to ASCII.""" # text = convert_to_ascii(text) text = lowercase(text) @@ -91,7 +96,7 @@ def transliteration_cleaners(text): return text -def basic_german_cleaners(text): +def basic_german_cleaners(text: str) -> str: """Pipeline for German text""" text = lowercase(text) text = collapse_whitespace(text) @@ -99,7 +104,7 @@ def basic_german_cleaners(text): # TODO: elaborate it -def basic_turkish_cleaners(text): +def basic_turkish_cleaners(text: str) -> str: """Pipeline for Turkish text""" text = text.replace("I", "Äą") text = lowercase(text) @@ -107,7 +112,7 @@ def basic_turkish_cleaners(text): return text -def english_cleaners(text): +def english_cleaners(text: str) -> str: """Pipeline for English text, including number and abbreviation expansion.""" # text = convert_to_ascii(text) text = lowercase(text) @@ -120,8 +125,12 @@ def english_cleaners(text): return text -def phoneme_cleaners(text): - """Pipeline for phonemes mode, including number and abbreviation expansion.""" +def phoneme_cleaners(text: str) -> str: + """Pipeline for phonemes mode, including number and abbreviation expansion. + + NB: This cleaner converts numbers into English words, for other languages + use multilingual_phoneme_cleaners(). 
+ """ text = en_normalize_numbers(text) text = expand_abbreviations(text) text = replace_symbols(text) @@ -130,7 +139,15 @@ def phoneme_cleaners(text): return text -def french_cleaners(text): +def multilingual_phoneme_cleaners(text: str) -> str: + """Pipeline for phonemes mode, including number and abbreviation expansion.""" + text = replace_symbols(text, lang=None) + text = remove_aux_symbols(text) + text = collapse_whitespace(text) + return text + + +def french_cleaners(text: str) -> str: """Pipeline for French text. There is no need to expand numbers, phonemizer already does that""" text = expand_abbreviations(text, lang="fr") text = lowercase(text) @@ -140,7 +157,7 @@ def french_cleaners(text): return text -def portuguese_cleaners(text): +def portuguese_cleaners(text: str) -> str: """Basic pipeline for Portuguese text. There is no need to expand abbreviation and numbers, phonemizer already does that""" text = lowercase(text) @@ -156,7 +173,7 @@ def chinese_mandarin_cleaners(text: str) -> str: return text -def multilingual_cleaners(text): +def multilingual_cleaners(text: str) -> str: """Pipeline for multilingual text""" text = lowercase(text) text = replace_symbols(text, lang=None) @@ -165,7 +182,7 @@ def multilingual_cleaners(text): return text -def no_cleaners(text): +def no_cleaners(text: str) -> str: # remove newline characters text = text.replace("\n", "") return text diff --git a/TTS/tts/utils/text/japanese/phonemizer.py b/TTS/tts/utils/text/japanese/phonemizer.py index c3111067e1..30072ae501 100644 --- a/TTS/tts/utils/text/japanese/phonemizer.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -350,8 +350,8 @@ def hira2kata(text: str) -> str: return text.replace("う゛", "ヴ") -_SYMBOL_TOKENS = set(list("ãƒģ、。īŧŸīŧ")) -_NO_YOMI_TOKENS = set(list("「」『』―īŧˆīŧ‰īŧģīŧŊ[] â€Ļ")) +_SYMBOL_TOKENS = set("ãƒģ、。īŧŸīŧ") +_NO_YOMI_TOKENS = set("「」『』―īŧˆīŧ‰īŧģīŧŊ[] â€Ļ") _TAGGER = MeCab.Tagger() diff --git a/TTS/tts/utils/text/korean/phonemizer.py 
b/TTS/tts/utils/text/korean/phonemizer.py index 2c69217c40..dde039b0f5 100644 --- a/TTS/tts/utils/text/korean/phonemizer.py +++ b/TTS/tts/utils/text/korean/phonemizer.py @@ -1,4 +1,7 @@ -from jamo import hangul_to_jamo +try: + from jamo import hangul_to_jamo +except ImportError as e: + raise ImportError("Korean requires: g2pkk, jamo") from e from TTS.tts.utils.text.korean.korean import normalize diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index f9a0340c55..fdf62bab3d 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -1,18 +1,29 @@ -from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer from TTS.tts.utils.text.phonemizers.base import BasePhonemizer from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut -from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer -from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer + +try: + from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer +except ImportError: + BN_Phonemizer = None try: from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer except ImportError: JA_JP_Phonemizer = None - pass -PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)} +try: + from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer +except ImportError: + KO_KR_Phonemizer = None + +try: + from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer +except ImportError: + ZH_CN_Phonemizer = None + +PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut)} ESPEAK_LANGS = list(ESpeak.supported_languages().keys()) @@ -33,17 +44,21 @@ # Force default for some languages DEF_LANG_TO_PHONEMIZER["en"] = 
DEF_LANG_TO_PHONEMIZER["en-us"] -DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() -DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name() -DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name() -# JA phonemizer has deal breaking dependencies like MeCab for some systems. -# So we only have it when we have it. +if BN_Phonemizer is not None: + PHONEMIZERS[BN_Phonemizer.name()] = BN_Phonemizer + DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name() if JA_JP_Phonemizer is not None: PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() +if KO_KR_Phonemizer is not None: + PHONEMIZERS[KO_KR_Phonemizer.name()] = KO_KR_Phonemizer + DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name() +if ZH_CN_Phonemizer is not None: + PHONEMIZERS[ZH_CN_Phonemizer.name()] = ZH_CN_Phonemizer + DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: @@ -61,14 +76,20 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: if name == "gruut": return Gruut(**kwargs) if name == "zh_cn_phonemizer": + if ZH_CN_Phonemizer is None: + raise ValueError("You need to install ZH phonemizer dependencies. Try `pip install coqui-tts[zh]`.") return ZH_CN_Phonemizer(**kwargs) if name == "ja_jp_phonemizer": if JA_JP_Phonemizer is None: - raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.") + raise ValueError("You need to install JA phonemizer dependencies. Try `pip install coqui-tts[ja]`.") return JA_JP_Phonemizer(**kwargs) if name == "ko_kr_phonemizer": + if KO_KR_Phonemizer is None: + raise ValueError("You need to install KO phonemizer dependencies. Try `pip install coqui-tts[ko]`.") return KO_KR_Phonemizer(**kwargs) if name == "bn_phonemizer": + if BN_Phonemizer is None: + raise ValueError("You need to install BN phonemizer dependencies. 
Try `pip install coqui-tts[bn]`.") return BN_Phonemizer(**kwargs) if name == "be_phonemizer": return BEL_Phonemizer(**kwargs) diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 4fc7987415..5e701df458 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,8 +1,11 @@ import abc +import logging from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation +logger = logging.getLogger(__name__) + class BasePhonemizer(abc.ABC): """Base phonemizer class @@ -136,5 +139,5 @@ def phonemize(self, text: str, separator="|", language: str = None) -> str: # p def print_logs(self, level: int = 0): indent = "\t" * level - print(f"{indent}| > phoneme language: {self.language}") - print(f"{indent}| > phoneme backend: {self.name()}") + logger.info("%s| phoneme language: %s", indent, self.language) + logger.info("%s| phoneme backend: %s", indent, self.name()) diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 328e52f369..a15df716e7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -1,15 +1,21 @@ +"""Wrapper to call the espeak/espeak-ng phonemizer.""" + import logging import re import subprocess -from typing import Dict, List +import tempfile +from pathlib import Path +from typing import Optional from packaging.version import Version from TTS.tts.utils.text.phonemizers.base import BasePhonemizer from TTS.tts.utils.text.punctuation import Punctuation +logger = logging.getLogger(__name__) + -def is_tool(name): +def _is_tool(name) -> bool: from shutil import which return which(name) is not None @@ -20,23 +26,25 @@ def is_tool(name): espeak_version_pattern = re.compile(r"text-to-speech:\s(?P\d+\.\d+(\.\d+)?)") -def get_espeak_version(): +def get_espeak_version() -> str: + """Return version of the `espeak` binary.""" output = 
subprocess.getoutput("espeak --version") match = espeak_version_pattern.search(output) return match.group("version") -def get_espeakng_version(): +def get_espeakng_version() -> str: + """Return version of the `espeak-ng` binary.""" output = subprocess.getoutput("espeak-ng --version") return output.split()[3] # priority: espeakng > espeak -if is_tool("espeak-ng"): +if _is_tool("espeak-ng"): _DEF_ESPEAK_LIB = "espeak-ng" _DEF_ESPEAK_VER = get_espeakng_version() -elif is_tool("espeak"): +elif _is_tool("espeak"): _DEF_ESPEAK_LIB = "espeak" _DEF_ESPEAK_VER = get_espeak_version() else: @@ -44,7 +52,7 @@ def get_espeakng_version(): _DEF_ESPEAK_VER = None -def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]: +def _espeak_exe(espeak_lib: str, args: list) -> list[str]: """Run espeak with the given arguments.""" cmd = [ espeak_lib, @@ -53,35 +61,22 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]: "1", # UTF8 text encoding ] cmd.extend(args) - logging.debug("espeakng: executing %s", repr(cmd)) - - with subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) as p: - res = iter(p.stdout.readline, b"") - if not sync: - p.stdout.close() - if p.stderr: - p.stderr.close() - if p.stdin: - p.stdin.close() - return res - res2 = [] - for line in res: - res2.append(line) - p.stdout.close() - if p.stderr: - p.stderr.close() - if p.stdin: - p.stdin.close() - p.wait() - return res2 + logger.debug("Executing: %s", repr(cmd)) + + p = subprocess.run(cmd, capture_output=True, encoding="utf8", check=True) + for line in p.stderr.strip().split("\n"): + if line.strip() != "": + logger.warning("%s: %s", espeak_lib, line.strip()) + res = [] + for line in p.stdout.strip().split("\n"): + if line.strip() != "": + logger.debug("%s: %s", espeak_lib, line.strip()) + res.append(line.strip()) + return res class ESpeak(BasePhonemizer): - """ESpeak wrapper calling `espeak` or `espeak-ng` from the command-line the perform G2P + """Wrapper 
calling `espeak` or `espeak-ng` from the command-line to perform G2P. Args: language (str): @@ -106,13 +101,17 @@ class ESpeak(BasePhonemizer): """ - _ESPEAK_LIB = _DEF_ESPEAK_LIB - _ESPEAK_VER = _DEF_ESPEAK_VER - - def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True): - if self._ESPEAK_LIB is None: - raise Exception(" [!] No espeak backend found. Install espeak-ng or espeak to your system.") - self.backend = self._ESPEAK_LIB + def __init__( + self, + language: str, + backend: Optional[str] = None, + punctuations: str = Punctuation.default_puncs(), + keep_puncs: bool = True, + ): + if _DEF_ESPEAK_LIB is None: + msg = "[!] No espeak backend found. Install espeak-ng or espeak to your system." + raise FileNotFoundError(msg) + self.backend = _DEF_ESPEAK_LIB # band-aid for backwards compatibility if language == "en": @@ -125,35 +124,37 @@ def __init__(self, language: str, backend=None, punctuations=Punctuation.default self.backend = backend @property - def backend(self): + def backend(self) -> str: return self._ESPEAK_LIB @property - def backend_version(self): + def backend_version(self) -> str: return self._ESPEAK_VER @backend.setter - def backend(self, backend): + def backend(self, backend: str) -> None: if backend not in ["espeak", "espeak-ng"]: - raise Exception("Unknown backend: %s" % backend) + msg = f"Unknown backend: {backend}" + raise ValueError(msg) self._ESPEAK_LIB = backend self._ESPEAK_VER = get_espeakng_version() if backend == "espeak-ng" else get_espeak_version() def auto_set_espeak_lib(self) -> None: - if is_tool("espeak-ng"): + if _is_tool("espeak-ng"): self._ESPEAK_LIB = "espeak-ng" self._ESPEAK_VER = get_espeakng_version() - elif is_tool("espeak"): + elif _is_tool("espeak"): self._ESPEAK_LIB = "espeak" self._ESPEAK_VER = get_espeak_version() else: - raise Exception("Cannot set backend automatically. espeak-ng or espeak not found") + msg = "Cannot set backend automatically. 
espeak-ng or espeak not found" + raise FileNotFoundError(msg) @staticmethod - def name(): + def name() -> str: return "espeak" - def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str: + def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False) -> str: """Convert input text to phonemes. Args: @@ -185,12 +186,15 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str: if tie: args.append("--tie=%s" % tie) - args.append(text) + tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") + tmp.write(text) + tmp.close() + args.append("-f") + args.append(tmp.name) + # compute phonemes phonemes = "" - for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): - logging.debug("line: %s", repr(line)) - ph_decoded = line.decode("utf8").strip() + for line in _espeak_exe(self.backend, args): # espeak: # version 1.48.15: " p_Éš_ˈaÉĒ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" # espeak-ng: @@ -200,16 +204,17 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str: # "sɛʁtˈɛĖƒ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaÉĄ də- lˈɑĖƒÉĄ." # phonemize needs to remove the language flags of the returned text: # "sɛʁtˈɛĖƒ mˈo kɔm fˈʊtbɔːl ʒenˈɛʁ de- flˈaÉĄ də- lˈɑĖƒÉĄ." - ph_decoded = re.sub(r"\(.+?\)", "", ph_decoded) + ph_decoded = re.sub(r"\(.+?\)", "", line) phonemes += ph_decoded.strip() + Path(tmp.name).unlink() return phonemes.replace("_", separator) - def _phonemize(self, text, separator=None): + def _phonemize(self, text: str, separator: str = "") -> str: return self.phonemize_espeak(text, separator, tie=False) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict[str, str]: """Get a dictionary of supported languages. 
Returns: @@ -219,16 +224,12 @@ def supported_languages() -> Dict: return {} args = ["--voices"] langs = {} - count = 0 - for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True): - line = line.decode("utf8").strip() + for count, line in enumerate(_espeak_exe(_DEF_ESPEAK_LIB, args)): if count > 0: cols = line.split() lang_code = cols[1] lang_name = cols[3] langs[lang_code] = lang_name - logging.debug("line: %s", repr(line)) - count += 1 return langs def version(self) -> str: @@ -237,16 +238,12 @@ def version(self) -> str: Returns: str: Version of the used backend. """ - args = ["--version"] - for line in _espeak_exe(self.backend, args, sync=True): - version = line.decode("utf8").strip().split()[2] - logging.debug("line: %s", repr(line)) - return version + return self.backend_version @classmethod - def is_available(cls): - """Return true if ESpeak is available else false""" - return is_tool("espeak") or is_tool("espeak-ng") + def is_available(cls) -> bool: + """Return true if ESpeak is available else false.""" + return _is_tool("espeak") or _is_tool("espeak-ng") if __name__ == "__main__": diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 62a9c39322..1a9e98b091 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,7 +1,10 @@ +import logging from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name +logger = logging.getLogger(__name__) + class MultiPhonemizer: """🐸TTS multi-phonemizer that operates phonemizers for multiple langugages @@ -46,8 +49,8 @@ def supported_languages(self) -> List: def print_logs(self, level: int = 0): indent = "\t" * level - print(f"{indent}| > phoneme language: {self.supported_languages()}") - print(f"{indent}| > phoneme backend: {self.name()}") + logger.info("%s| phoneme language: %s", indent, self.supported_languages()) + 
logger.info("%s| phoneme backend: %s", indent, self.name()) # if __name__ == "__main__": diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index b7faf86e8a..f653cdf13f 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,3 +1,4 @@ +import logging from typing import Callable, Dict, List, Union from TTS.tts.utils.text import cleaners @@ -6,6 +7,8 @@ from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer from TTS.utils.generic_utils import get_import_path, import_class +logger = logging.getLogger(__name__) + class TTSTokenizer: """🐸TTS tokenizer to convert input characters to token IDs and back. @@ -73,8 +76,8 @@ def encode(self, text: str) -> List[int]: # discard but store not found characters if char not in self.not_found_characters: self.not_found_characters.append(char) - print(text) - print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.") + logger.warning(text) + logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char)) return token_ids def decode(self, token_ids: List[int]) -> str: @@ -104,10 +107,13 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: 5. 
Text to token IDs """ # TODO: text cleaner should pick the right routine based on the language + logger.debug("Tokenizer input text: %s", text) if self.text_cleaner is not None: text = self.text_cleaner(text) + logger.debug("Cleaned text: %s", text) if self.use_phonemes: text = self.phonemizer.phonemize(text, separator="", language=language) + logger.debug("Phonemes: %s", text) text = self.encode(text) if self.add_blank: text = self.intersperse_blank_char(text, True) @@ -135,16 +141,16 @@ def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool def print_logs(self, level: int = 0): indent = "\t" * level - print(f"{indent}| > add_blank: {self.add_blank}") - print(f"{indent}| > use_eos_bos: {self.use_eos_bos}") - print(f"{indent}| > use_phonemes: {self.use_phonemes}") + logger.info("%s| add_blank: %s", indent, self.add_blank) + logger.info("%s| use_eos_bos: %s", indent, self.use_eos_bos) + logger.info("%s| use_phonemes: %s", indent, self.use_phonemes) if self.use_phonemes: - print(f"{indent}| > phonemizer:") + logger.info("%s| phonemizer:", indent) self.phonemizer.print_logs(level + 1) if len(self.not_found_characters) > 0: - print(f"{indent}| > {len(self.not_found_characters)} not found characters:") + logger.info("%s| %d characters not found:", indent, len(self.not_found_characters)) for char in self.not_found_characters: - print(f"{indent}| > {char}") + logger.info("%s| %s", indent, char) @staticmethod def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index af88569fc3..4a8972480c 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,3 +1,4 @@ +import logging from io import BytesIO from typing import Tuple @@ -7,6 +8,8 @@ import soundfile as sf from librosa import magphase, pyin +logger = logging.getLogger(__name__) + # For using kwargs # pylint: disable=unused-argument @@ -222,7 +225,7 
@@ def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray S_complex = np.abs(spec).astype(complex) y = istft(y=S_complex * angles, **kwargs) if not np.isfinite(y).all(): - print(" [!] Waveform is not finite everywhere. Skipping the GL.") + logger.warning("Waveform is not finite everywhere. Skipping the GL.") return np.array([0.0]) for _ in range(num_iter): angles = np.exp(1j * np.angle(stft(y=y, **kwargs))) diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index c53bad562e..680e29debc 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,3 +1,4 @@ +import logging from io import BytesIO from typing import Dict, Tuple @@ -26,6 +27,8 @@ volume_norm, ) +logger = logging.getLogger(__name__) + # pylint: disable=too-many-public-methods @@ -132,10 +135,6 @@ class AudioProcessor(object): stats_path (str, optional): Path to the computed stats file. Defaults to None. - - verbose (bool, optional): - enable/disable logging. Defaults to True. - """ def __init__( @@ -172,7 +171,6 @@ def __init__( do_rms_norm=False, db_level=None, stats_path=None, - verbose=True, **_, ): # setup class attributed @@ -228,10 +226,9 @@ def __init__( self.win_length <= self.fft_size ), f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) - if verbose: - print(" > Setting up Audio Processor...") - for key, value in members.items(): - print(" | > {}:{}".format(key, value)) + logger.info("Setting up Audio Processor...") + for key, value in members.items(): + logger.info(" | %s: %s", key, value) # create spectrogram utils self.mel_basis = build_mel_basis( sample_rate=self.sample_rate, @@ -250,10 +247,10 @@ def __init__( self.symmetric_norm = None @staticmethod - def init_from_config(config: "Coqpit", verbose=True): + def init_from_config(config: "Coqpit"): if "audio" in config: - return AudioProcessor(verbose=verbose, **config.audio) - return AudioProcessor(verbose=verbose, **config) + return AudioProcessor(**config.audio) + return AudioProcessor(**config) ### normalization ### def normalize(self, S: np.ndarray) -> np.ndarray: @@ -595,7 +592,7 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: try: x = self.trim_silence(x) except ValueError: - print(f" [!] 
File cannot be trimmed for silence - {filename}") + logger.exception("File cannot be trimmed for silence - %s", filename) if self.do_sound_norm: x = self.sound_norm(x) if self.do_rms_norm: diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py index fd40ebb048..632969c51a 100644 --- a/TTS/utils/audio/torch_transforms.py +++ b/TTS/utils/audio/torch_transforms.py @@ -119,17 +119,19 @@ def __call__(self, x): padding = int((self.n_fft - self.hop_length) / 2) x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=self.normalized, - onesided=True, - return_complex=False, + o = torch.view_as_real( + torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=self.normalized, + onesided=True, + return_complex=True, + ) ) M = o[:, :, :, 0] P = o[:, :, :, 1] diff --git a/TTS/utils/download.py b/TTS/utils/download.py index 3f06b57824..e94b1d68c8 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -12,6 +12,8 @@ from torch.utils.model_zoo import tqdm +logger = logging.getLogger(__name__) + def stream_url( url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True @@ -36,13 +38,16 @@ def stream_url( if start_byte: req.headers["Range"] = "bytes={}-".format(start_byte) - with urllib.request.urlopen(req) as upointer, tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=url_size, - disable=not progress_bar, - ) as pbar: + with ( + urllib.request.urlopen(req) as upointer, + tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar, + ): num_bytes = 0 while True: chunk = upointer.read(block_size) @@ -146,20 +151,20 @@ 
def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo Returns: list: List of paths to extracted files even if not overwritten. """ - + logger.info("Extracting archive file...") if to_path is None: to_path = os.path.dirname(from_path) try: with tarfile.open(from_path, "r") as tar: - logging.info("Opened tar file %s.", from_path) + logger.info("Opened tar file %s.", from_path) files = [] for file_ in tar: # type: Any file_path = os.path.join(to_path, file_.name) if file_.isfile(): files.append(file_path) if os.path.exists(file_path): - logging.info("%s already extracted.", file_path) + logger.info("%s already extracted.", file_path) if not overwrite: continue tar.extract(file_, to_path) @@ -169,12 +174,12 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo try: with zipfile.ZipFile(from_path, "r") as zfile: - logging.info("Opened zip file %s.", from_path) + logger.info("Opened zip file %s.", from_path) files = zfile.namelist() for file_ in files: file_path = os.path.join(to_path, file_) if os.path.exists(file_path): - logging.info("%s already extracted.", file_path) + logger.info("%s already extracted.", file_path) if not overwrite: continue zfile.extract(file_, to_path) @@ -198,9 +203,10 @@ def download_kaggle_dataset(dataset_path: str, dataset_name: str, output_path: s import kaggle # pylint: disable=import-outside-toplevel kaggle.api.authenticate() - print(f"""\nDownloading {dataset_name}...""") + logger.info("Downloading %s...", dataset_name) kaggle.api.dataset_download_files(dataset_path, path=data_path, unzip=True) except OSError: - print( - f"""[!] 
in order to download kaggle datasets, you need to have a kaggle api token stored in your {os.path.join(expanduser('~'), '.kaggle/kaggle.json')}""" + logger.exception( + "In order to download kaggle datasets, you need to have a kaggle api token stored in your %s", + os.path.join(expanduser("~"), ".kaggle/kaggle.json"), ) diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 104dc7b94e..8705873982 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,8 +1,11 @@ +import logging import os from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive +logger = logging.getLogger(__name__) + def download_ljspeech(path: str): """Download and extract LJSpeech dataset @@ -15,7 +18,6 @@ def download_ljspeech(path: str): download_url(url, path) basename = os.path.basename(url) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) @@ -35,7 +37,6 @@ def download_vctk(path: str, use_kaggle: Optional[bool] = False): download_url(url, path) basename = os.path.basename(url) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) @@ -71,19 +72,17 @@ def download_libri_tts(path: str, subset: Optional[str] = "all"): os.makedirs(path, exist_ok=True) if subset == "all": for sub, val in subset_dict.items(): - print(f" > Downloading {sub}...") + logger.info("Downloading %s...", sub) download_url(val, path) basename = os.path.basename(val) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) - print(" > All subsets downloaded") + logger.info("All subsets downloaded") else: url = subset_dict[subset] download_url(url, path) basename = os.path.basename(url) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) @@ -98,7 +97,6 @@ def download_thorsten_de(path: str): download_url(url, path) basename = 
os.path.basename(url) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) @@ -122,5 +120,4 @@ def download_mailabs(path: str, language: str = "english"): download_url(url, path) basename = os.path.basename(url) archive = os.path.join(path, basename) - print(" > Extracting archive file...") extract_archive(archive) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 9730576239..91f8844262 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -2,86 +2,11 @@ import datetime import importlib import logging -import os import re -import subprocess -import sys from pathlib import Path -from typing import Dict - -import fsspec -import torch - - -def to_cuda(x: torch.Tensor) -> torch.Tensor: - if x is None: - return None - if torch.is_tensor(x): - x = x.contiguous() - if torch.cuda.is_available(): - x = x.cuda(non_blocking=True) - return x - - -def get_cuda(): - use_cuda = torch.cuda.is_available() - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - return use_cuda, device - - -def get_git_branch(): - try: - out = subprocess.check_output(["git", "branch"]).decode("utf8") - current = next(line for line in out.split("\n") if line.startswith("*")) - current.replace("* ", "") - except subprocess.CalledProcessError: - current = "inside_docker" - except FileNotFoundError: - current = "unknown" - except StopIteration: - current = "unknown" - return current - - -def get_commit_hash(): - """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" - # try: - # subprocess.check_output(['git', 'diff-index', '--quiet', - # 'HEAD']) # Verify client is clean - # except: - # raise RuntimeError( - # " !! 
Commit before training to get the commit hash.") - try: - commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode().strip() - # Not copying .git folder into docker container - except (subprocess.CalledProcessError, FileNotFoundError): - commit = "0000000" - return commit - - -def get_experiment_folder_path(root_path, model_name): - """Get an experiment folder path with the current date and time""" - date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - commit_hash = get_commit_hash() - output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) - return output_folder - - -def remove_experiment_folder(experiment_path): - """Check folder if there is a checkpoint, otherwise remove the folder""" - fs = fsspec.get_mapper(experiment_path).fs - checkpoint_files = fs.glob(experiment_path + "/*.pth") - if not checkpoint_files: - if fs.exists(experiment_path): - fs.rm(experiment_path, recursive=True) - print(" ! Run is removed from {}".format(experiment_path)) - else: - print(" ! 
Run is kept in {}".format(experiment_path)) - - -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) +from typing import Dict, Optional + +logger = logging.getLogger(__name__) def to_camel(text): @@ -126,33 +51,11 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def get_user_data_dir(appname): - TTS_HOME = os.environ.get("TTS_HOME") - XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME") - if TTS_HOME is not None: - ans = Path(TTS_HOME).expanduser().resolve(strict=False) - elif XDG_DATA_HOME is not None: - ans = Path(XDG_DATA_HOME).expanduser().resolve(strict=False) - elif sys.platform == "win32": - import winreg # pylint: disable=import-outside-toplevel - - key = winreg.OpenKey( - winreg.HKEY_CURRENT_USER, r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders" - ) - dir_, _ = winreg.QueryValueEx(key, "Local AppData") - ans = Path(dir_).resolve(strict=False) - elif sys.platform == "darwin": - ans = Path("~/Library/Application Support/").expanduser() - else: - ans = Path.home().joinpath(".local/share") - return ans.joinpath(appname) - - def set_init_dict(model_dict, checkpoint_state, c): # Partial initialization: if there is a mismatch with new and old layer, it is skipped. for k, v in checkpoint_state.items(): if k not in model_dict: - print(" | > Layer missing in the model definition: {}".format(k)) + logger.warning("Layer missing in the model finition %s", k) # 1. filter out unnecessary keys pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} # 2. filter out different size layers @@ -163,7 +66,7 @@ def set_init_dict(model_dict, checkpoint_state, c): pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} # 4. 
overwrite entries in the existing state dict model_dict.update(pretrained_dict) - print(" | > {} / {} layers are restored.".format(len(pretrained_dict), len(model_dict))) + logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) return model_dict @@ -184,54 +87,43 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: return kwargs -class KeepAverage: - def __init__(self): - self.avg_values = {} - self.iters = {} +def get_timestamp() -> str: + return datetime.datetime.now().strftime("%y%m%d-%H%M%S") - def __getitem__(self, key): - return self.avg_values[key] - def items(self): - return self.avg_values.items() +class ConsoleFormatter(logging.Formatter): + """Custom formatter that prints logging.INFO messages without the level name. - def add_value(self, name, init_val=0, init_iter=0): - self.avg_values[name] = init_val - self.iters[name] = init_iter + Source: https://stackoverflow.com/a/62488520 + """ - def update_value(self, name, value, weighted_avg=False): - if name not in self.avg_values: - # add value if not exist before - self.add_value(name, init_val=value) + def format(self, record): + if record.levelno == logging.INFO: + self._style._fmt = "%(message)s" else: - # else update existing value - if weighted_avg: - self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value - self.iters[name] += 1 - else: - self.avg_values[name] = self.avg_values[name] * self.iters[name] + value - self.iters[name] += 1 - self.avg_values[name] /= self.iters[name] - - def add_values(self, name_dict): - for key, value in name_dict.items(): - self.add_value(key, init_val=value) - - def update_values(self, value_dict): - for key, value in value_dict.items(): - self.update_value(key, value) - - -def get_timestamp(): - return datetime.now().strftime("%y%m%d-%H%M%S") - - -def setup_logger(logger_name, root, phase, level=logging.INFO, screen=False, tofile=False): + self._style._fmt = "%(levelname)s: %(message)s" + return super().format(record) + 
+ +def setup_logger( + logger_name: str, + level: int = logging.INFO, + *, + formatter: Optional[logging.Formatter] = None, + screen: bool = False, + tofile: bool = False, + log_dir: str = "logs", + log_name: str = "log", +) -> None: lg = logging.getLogger(logger_name) - formatter = logging.Formatter("%(asctime)s.%(msecs)03d - %(levelname)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S") + if formatter is None: + formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d - %(levelname)-8s - %(name)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S" + ) lg.setLevel(level) if tofile: - log_file = os.path.join(root, phase + "_{}.log".format(get_timestamp())) + Path(log_dir).mkdir(exist_ok=True, parents=True) + log_file = Path(log_dir) / f"{log_name}_{get_timestamp()}.log" fh = logging.FileHandler(log_file, mode="w") fh.setFormatter(formatter) lg.addHandler(fh) diff --git a/TTS/utils/io.py b/TTS/utils/io.py deleted file mode 100644 index 3107ba661b..0000000000 --- a/TTS/utils/io.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import pickle as pickle_tts -from typing import Any, Callable, Dict, Union - -import fsspec -import torch - -from TTS.utils.generic_utils import get_user_data_dir - - -class RenamingUnpickler(pickle_tts.Unpickler): - """Overload default pickler to solve module renaming problem""" - - def find_class(self, module, name): - return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name) - - -class AttrDict(dict): - """A custom dict which converts dict keys - to class attributes""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.__dict__ = self - - -def load_fsspec( - path: str, - map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, - cache: bool = True, - **kwargs, -) -> Any: - """Like torch.load but can load from other locations (e.g. s3:// , gs://). - - Args: - path: Any path or url supported by fsspec. - map_location: torch.device or str. 
- cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True. - **kwargs: Keyword arguments forwarded to torch.load. - - Returns: - Object stored in path. - """ - is_local = os.path.isdir(path) or os.path.isfile(path) - if cache and not is_local: - with fsspec.open( - f"filecache::{path}", - filecache={"cache_storage": str(get_user_data_dir("tts_cache"))}, - mode="rb", - ) as f: - return torch.load(f, map_location=map_location, **kwargs) - else: - with fsspec.open(path, "rb") as f: - return torch.load(f, map_location=map_location, **kwargs) - - -def load_checkpoint( - model, checkpoint_path, use_cuda=False, eval=False, cache=False -): # pylint: disable=redefined-builtin - try: - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache) - model.load_state_dict(state["model"]) - if use_cuda: - model.cuda() - if eval: - model.eval() - return model, state diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 3a527f4609..fb5071d9b0 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -1,18 +1,21 @@ import json +import logging import os import re import tarfile import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, List, Tuple +from typing import Dict, Tuple import fsspec import requests from tqdm import tqdm +from trainer.io import get_user_data_dir from TTS.config import load_config, read_json_with_comments -from TTS.utils.generic_utils import get_user_data_dir + +logger = logging.getLogger(__name__) LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", @@ -40,13 +43,11 @@ class ModelManager(object): models_file (str): path to .model.json file. Defaults to None. 
output_prefix (str): prefix to `tts` to download models. Defaults to None progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. - verbose (bool): print info. Defaults to True. """ - def __init__(self, models_file=None, output_prefix=None, progress_bar=False, verbose=True): + def __init__(self, models_file=None, output_prefix=None, progress_bar=False): super().__init__() self.progress_bar = progress_bar - self.verbose = verbose if output_prefix is None: self.output_prefix = get_user_data_dir("tts") else: @@ -68,19 +69,16 @@ def read_models_file(self, file_path): self.models_dict = read_json_with_comments(file_path) def _list_models(self, model_type, model_count=0): - if self.verbose: - print("\n Name format: type/language/dataset/model") + logger.info("") + logger.info("Name format: type/language/dataset/model") model_list = [] for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: for model in self.models_dict[model_type][lang][dataset]: model_full_name = f"{model_type}--{lang}--{dataset}--{model}" - output_path = os.path.join(self.output_prefix, model_full_name) - if self.verbose: - if os.path.exists(output_path): - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") - else: - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") + output_path = Path(self.output_prefix) / model_full_name + downloaded = " [already downloaded]" if output_path.is_dir() else "" + logger.info(" %2d: %s/%s/%s/%s%s", model_count, model_type, lang, dataset, model, downloaded) model_list.append(f"{model_type}/{lang}/{dataset}/{model}") model_count += 1 return model_list @@ -99,21 +97,36 @@ def list_models(self): models_name_list.extend(model_list) return models_name_list + def log_model_details(self, model_type, lang, dataset, model): + logger.info("Model type: %s", model_type) + logger.info("Language supported: %s", lang) + logger.info("Dataset used: %s", dataset) + 
logger.info("Model name: %s", model) + if "description" in self.models_dict[model_type][lang][dataset][model]: + logger.info("Description: %s", self.models_dict[model_type][lang][dataset][model]["description"]) + else: + logger.info("Description: coming soon") + if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]: + logger.info( + "Default vocoder: %s", + self.models_dict[model_type][lang][dataset][model]["default_vocoder"], + ) + def model_info_by_idx(self, model_query): - """Print the description of the model from .models.json file using model_idx + """Print the description of the model from .models.json file using model_query_idx Args: - model_query (str): / + model_query (str): / """ model_name_list = [] model_type, model_query_idx = model_query.split("/") try: model_query_idx = int(model_query_idx) if model_query_idx <= 0: - print("> model_query_idx should be a positive integer!") + logger.error("model_query_idx [%d] should be a positive integer!", model_query_idx) return - except: - print("> model_query_idx should be an integer!") + except (TypeError, ValueError): + logger.error("model_query_idx [%s] should be an integer!", model_query_idx) return model_count = 0 if model_type in self.models_dict: @@ -123,22 +136,13 @@ def model_info_by_idx(self, model_query): model_name_list.append(f"{model_type}/{lang}/{dataset}/{model}") model_count += 1 else: - print(f"> model_type {model_type} does not exist in the list.") + logger.error("Model type %s does not exist in the list.", model_type) return if model_query_idx > model_count: - print(f"model query idx exceeds the number of available models [{model_count}] ") + logger.error("model_query_idx exceeds the number of available models [%d]", model_count) else: model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/") - print(f"> model type : {model_type}") - print(f"> language supported : {lang}") - print(f"> dataset used : {dataset}") - print(f"> model name : {model}") - 
if "description" in self.models_dict[model_type][lang][dataset][model]: - print(f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}") - else: - print("> description : coming soon") - if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]: - print(f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}") + self.log_model_details(model_type, lang, dataset, model) def model_info_by_full_name(self, model_query_name): """Print the description of the model from .models.json file using model_full_name @@ -147,32 +151,19 @@ def model_info_by_full_name(self, model_query_name): model_query_name (str): Format is /// """ model_type, lang, dataset, model = model_query_name.split("/") - if model_type in self.models_dict: - if lang in self.models_dict[model_type]: - if dataset in self.models_dict[model_type][lang]: - if model in self.models_dict[model_type][lang][dataset]: - print(f"> model type : {model_type}") - print(f"> language supported : {lang}") - print(f"> dataset used : {dataset}") - print(f"> model name : {model}") - if "description" in self.models_dict[model_type][lang][dataset][model]: - print( - f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}" - ) - else: - print("> description : coming soon") - if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]: - print( - f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}" - ) - else: - print(f"> model {model} does not exist for {model_type}/{lang}/{dataset}.") - else: - print(f"> dataset {dataset} does not exist for {model_type}/{lang}.") - else: - print(f"> lang {lang} does not exist for {model_type}.") - else: - print(f"> model_type {model_type} does not exist in the list.") + if model_type not in self.models_dict: + logger.error("Model type %s does not exist in the list.", model_type) + return + if lang not in 
self.models_dict[model_type]: + logger.error("Language %s does not exist for %s.", lang, model_type) + return + if dataset not in self.models_dict[model_type][lang]: + logger.error("Dataset %s does not exist for %s/%s.", dataset, model_type, lang) + return + if model not in self.models_dict[model_type][lang][dataset]: + logger.error("Model %s does not exist for %s/%s/%s.", model, model_type, lang, dataset) + return + self.log_model_details(model_type, lang, dataset, model) def list_tts_models(self): """Print all `TTS` models and return a list of model names @@ -197,18 +188,18 @@ def list_vc_models(self): def list_langs(self): """Print all the available languages""" - print(" Name format: type/language") + logger.info("Name format: type/language") for model_type in self.models_dict: for lang in self.models_dict[model_type]: - print(f" >: {model_type}/{lang} ") + logger.info(" %s/%s", model_type, lang) def list_datasets(self): """Print all the datasets""" - print(" Name format: type/language/dataset") + logger.info("Name format: type/language/dataset") for model_type in self.models_dict: for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: - print(f" >: {model_type}/{lang}/{dataset}") + logger.info(" %s/%s/%s", model_type, lang, dataset) @staticmethod def print_model_license(model_item: Dict): @@ -218,13 +209,13 @@ def print_model_license(model_item: Dict): model_item (dict): model item in the models.json """ if "license" in model_item and model_item["license"].strip() != "": - print(f" > Model's license - {model_item['license']}") + logger.info("Model's license - %s", model_item["license"]) if model_item["license"].lower() in LICENSE_URLS: - print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.") + logger.info("Check %s for more info.", LICENSE_URLS[model_item["license"].lower()]) else: - print(" > Check https://opensource.org/licenses for more info.") + logger.info("Check 
https://opensource.org/licenses for more info.") else: - print(" > Model's license - No license information available") + logger.info("Model's license - No license information available") def _download_github_model(self, model_item: Dict, output_path: str): if isinstance(model_item["github_rls_url"], list): @@ -260,8 +251,7 @@ def set_model_url(model_item: Dict): def _set_model_item(self, model_name): # fetch model info from the dict if "fairseq" in model_name: - model_type = "tts_models" - lang = model_name.split("/")[1] + model_type, lang, dataset, model = model_name.split("/") model_item = { "model_type": "tts_models", "license": "CC BY-NC 4.0", @@ -337,7 +327,7 @@ def create_dir_and_download_model(self, model_name, model_item, output_path): if not self.ask_tos(output_path): os.rmdir(output_path) raise Exception(" [!] You must agree to the terms of service to use this model.") - print(f" > Downloading model to {output_path}") + logger.info("Downloading model to %s", output_path) try: if "fairseq" in model_name: self.download_fairseq_model(model_name, output_path) @@ -347,7 +337,7 @@ def create_dir_and_download_model(self, model_name, model_item, output_path): self._download_hf_model(model_item, output_path) except requests.RequestException as e: - print(f" > Failed to download the model file to {output_path}") + logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e self.print_model_license(model_item=model_item) @@ -365,7 +355,7 @@ def check_if_configs_are_equal(self, model_name, model_item, output_path): config_remote = json.load(f) if not config_local == config_remote: - print(f" > {model_name} is already downloaded however it has been changed. Redownloading it...") + logger.info("%s is already downloaded however it has been changed. 
Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) def download_model(self, model_name): @@ -391,12 +381,12 @@ def download_model(self, model_name): if os.path.isfile(md5sum_file): with open(md5sum_file, mode="r") as f: if not f.read() == md5sum: - print(f" > {model_name} has been updated, clearing model cache...") + logger.info("%s has been updated, clearing model cache...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) else: - print(f" > {model_name} is already downloaded.") + logger.info("%s is already downloaded.", model_name) else: - print(f" > {model_name} has been updated, clearing model cache...") + logger.info("%s has been updated, clearing model cache...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) # if the configs are different, redownload it # ToDo: we need a better way to handle it @@ -406,7 +396,7 @@ def download_model(self, model_name): except: pass else: - print(f" > {model_name} is already downloaded.") + logger.info("%s is already downloaded.", model_name) else: self.create_dir_and_download_model(model_name, model_item, output_path) @@ -516,7 +506,7 @@ def _update_path(field_name, new_path, config_path): sub_conf[field_names[-1]] = new_path else: # field name points to a top-level field - if not field_name in config: + if field_name not in config: return if isinstance(config[field_name], list): config[field_name] = [new_path] @@ -545,7 +535,7 @@ def _download_zip_file(file_url, output_folder, progress_bar): z.extractall(output_folder) os.remove(temp_zip_name) # delete zip after extract except zipfile.BadZipFile: - print(f" > Error: Bad zip file - {file_url}") + logger.exception("Bad zip file - %s", file_url) raise zipfile.BadZipFile # pylint: disable=raise-missing-from # move the files to the outer path for file_path in z.namelist(): @@ -581,7 +571,7 @@ def _download_tar_file(file_url, output_folder, progress_bar): 
tar_names = t.getnames() os.remove(temp_tar_name) # delete tar after extract except tarfile.ReadError: - print(f" > Error: Bad tar file - {file_url}") + logger.exception("Bad tar file - %s", file_url) raise tarfile.ReadError # pylint: disable=raise-missing-from # move the files to the outer path for file_path in os.listdir(os.path.join(output_folder, tar_names[0])): diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index b98647c30c..50a7893047 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,3 +1,4 @@ +import logging import os import time from typing import List @@ -21,6 +22,8 @@ from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input +logger = logging.getLogger(__name__) + class Synthesizer(nn.Module): def __init__( @@ -218,7 +221,7 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N use_cuda (bool): enable/disable CUDA use. """ self.vocoder_config = load_config(model_config) - self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) + self.vocoder_ap = AudioProcessor(**self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: @@ -294,9 +297,9 @@ def tts( if text: sens = [text] if split_sentences: - print(" > Text splitted to sentences.") sens = self.split_into_sentences(text) - print(sens) + logger.info("Text split into sentences.") + logger.info("Input: %s", sens) # handle multi-speaker if "voice_dir" in kwargs: @@ -335,7 +338,7 @@ def tts( # handle multi-lingual language_id = None if self.tts_languages_file or ( - hasattr(self.tts_model, "language_manager") + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None and not self.tts_config.model == "xtts" ): @@ -420,7 +423,7 @@ def tts( self.vocoder_config["audio"]["sample_rate"] / 
self.tts_model.ap.sample_rate, ] if scale_factor[1] != 1: - print(" > interpolating tts model output.") + logger.info("Interpolating TTS model output.") vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) else: vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable @@ -484,7 +487,7 @@ def tts( self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, ] if scale_factor[1] != 1: - print(" > interpolating tts model output.") + logger.info("Interpolating TTS model output.") vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) else: vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable @@ -500,6 +503,6 @@ def tts( # compute stats process_time = time.time() - start_time audio_time = len(wavs) / self.tts_config.audio["sample_rate"] - print(f" > Processing time: {process_time}") - print(f" > Real-time factor: {process_time / audio_time}") + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) return wavs diff --git a/TTS/utils/training.py b/TTS/utils/training.py index b51f55e92b..57885005f1 100644 --- a/TTS/utils/training.py +++ b/TTS/utils/training.py @@ -1,6 +1,10 @@ +import logging + import numpy as np import torch +logger = logging.getLogger(__name__) + def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): r"""Check model gradient against unexpected jumps and failures""" @@ -21,11 +25,11 @@ def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): # compatibility with different torch versions if isinstance(grad_norm, float): if np.isinf(grad_norm): - print(" | > Gradient is INF !!") + logger.warning("Gradient is INF !!") skip_flag = True else: if torch.isinf(grad_norm): - print(" | > Gradient is INF !!") + logger.warning("Gradient is INF !!") skip_flag = True return grad_norm, skip_flag diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py 
index aefce2b50b..49c8dc6b66 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,6 +1,10 @@ +import logging + import torch import torchaudio +logger = logging.getLogger(__name__) + def read_audio(path): wav, sr = torchaudio.load(path) @@ -54,8 +58,8 @@ def remove_silence( # read ground truth wav and resample the audio for the VAD try: wav, gt_sample_rate = read_audio(audio_path) - except: - print(f"> ❗ Failed to read {audio_path}") + except Exception: + logger.exception("Failed to read %s", audio_path) return None, False # if needed, resample the audio for the VAD model @@ -80,7 +84,7 @@ def remove_silence( wav = collect_chunks(new_speech_timestamps, wav) is_speech = True else: - print(f"> The file {audio_path} probably does not have speech please check it !!") + logger.warning("The file %s probably does not have speech please check it!", audio_path) is_speech = False # save diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index 74164a7444..b2fe63d29d 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,7 +1,5 @@ -from dataclasses import asdict, dataclass, field -from typing import Dict, List - -from coqpit import Coqpit, check_argument +from dataclasses import dataclass, field +from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index 5a09b4e53e..a498b292b7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,7 +1,10 @@ import importlib +import logging import re from typing import Dict, List, Union +logger = logging.getLogger(__name__) + def to_camel(text): text = text.capitalize() @@ -9,7 +12,7 @@ def to_camel(text): def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": - print(" > Using model: {}".format(config.model)) + logger.info("Using model: %s", config.model) # fetch the right model implementation. 
if "model" in config and config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 19f2761bbc..22ffd0095c 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,6 +1,7 @@ +import logging import os import random -from typing import Dict, List, Tuple, Union +from typing import Any, Optional, Union import torch import torch.distributed as dist @@ -9,6 +10,7 @@ from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler from trainer.torch import DistributedSampler, DistributedSamplerWrapper +from trainer.trainer import Trainer from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset @@ -17,9 +19,12 @@ from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio.processor import AudioProcessor # pylint: skip-file +logger = logging.getLogger(__name__) + class BaseVC(BaseTrainerModel): """Base `vc` class. Every new `vc` model must inherit this. @@ -32,10 +37,10 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: "AudioProcessor", - speaker_manager: SpeakerManager = None, - language_manager: LanguageManager = None, - ): + ap: AudioProcessor, + speaker_manager: Optional[SpeakerManager] = None, + language_manager: Optional[LanguageManager] = None, + ) -> None: super().__init__() self.config = config self.ap = ap @@ -43,7 +48,7 @@ def __init__( self.language_manager = language_manager self._set_model_args(config) - def _set_model_args(self, config: Coqpit): + def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). `ModelArgs` has all the fields reuqired to initialize the model architecture. 
@@ -64,7 +69,7 @@ def _set_model_args(self, config: Coqpit): else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: List = None): + def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining `in_channels` size of the connected layers. @@ -93,15 +98,15 @@ def init_multispeaker(self, config: Coqpit, data: List = None): ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: - print(" > Init speaker_embedding layer.") + logger.info("Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info): + def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -129,7 +134,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): if speaker_name is None: d_vector = self.speaker_manager.get_random_embedding() else: - d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) + d_vector = self.speaker_manager.get_mean_embedding(speaker_name) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_id() @@ -148,16 +153,16 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: """Generic batch formatting 
for `VCDataset`. You must override this if you use a custom dataset. Args: - batch (Dict): [description] + batch (dict): [description] Returns: - Dict: [description] + dict: [description] """ # setup input batch text_input = batch["token_id"] @@ -227,18 +232,18 @@ def format_batch(self, batch: Dict) -> Dict: "audio_unique_names": batch["audio_unique_names"], } - def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): + def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus: int = 1): weights = None data_items = dataset.samples if getattr(config, "use_language_weighted_sampler", False): alpha = getattr(config, "language_weighted_sampler_alpha", 1.0) - print(" > Using Language weighted sampler with alpha:", alpha) + logger.info("Using Language weighted sampler with alpha: %.2f", alpha) weights = get_language_balancer_weights(data_items) * alpha if getattr(config, "use_speaker_weighted_sampler", False): alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0) - print(" > Using Speaker weighted sampler with alpha:", alpha) + logger.info("Using Speaker weighted sampler with alpha: %.2f", alpha) if weights is not None: weights += get_speaker_balancer_weights(data_items) * alpha else: @@ -246,7 +251,7 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): if getattr(config, "use_length_weighted_sampler", False): alpha = getattr(config, "length_weighted_sampler_alpha", 1.0) - print(" > Using Length weighted sampler with alpha:", alpha) + logger.info("Using Length weighted sampler with alpha: %.2f", alpha) if weights is not None: weights += get_length_balancer_weights(data_items) * alpha else: @@ -268,12 +273,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: Union[list[dict], list[list]], verbose: bool, num_gpus: int, - rank: int = None, + rank: 
Optional[int] = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -318,7 +323,6 @@ def get_data_loader( phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=config.precompute_num_workers, use_noise_augment=False if is_eval else config.use_noise_augment, - verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, tokenizer=None, @@ -350,22 +354,24 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict[str, Any]: d_vector = None - if self.config.use_d_vector_file: + if self.speaker_manager is not None and self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1), + "speaker_id": ( + None + if not self.config.use_speaker_embedding + else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1) + ), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `vc` models used by `Trainer`. You can override this for a different behaviour. @@ -374,9 +380,9 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: assets (dict): A dict of training assets. For `vc` models, it must include `{'audio_processor': ap}`. Returns: - Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. + tuple[dict, dict]: Test figures and audios to be projected to Tensorboard. 
""" - print(" | > Synthesizing test sentences.") + logger.info("Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -405,7 +411,7 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: ) return test_figures, test_audios - def on_init_start(self, trainer): + def on_init_start(self, trainer: Trainer) -> None: """Save the speaker.pth and language_ids.json at the beginning of the training. Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.pth") @@ -415,8 +421,8 @@ def on_init_start(self, trainer): if hasattr(trainer.config, "model_args"): trainer.config.model_args.speakers_file = output_path trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) - print(f" > `speakers.pth` is saved to {output_path}.") - print(" > `speakers_file` is updated in the config.json.") + logger.info("`speakers.pth` is saved to %s", output_path) + logger.info("`speakers_file` is updated in the config.json.") if self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") @@ -425,5 +431,5 @@ def on_init_start(self, trainer): if hasattr(trainer.config, "model_args"): trainer.config.model_args.language_ids_file = output_path trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) - print(f" > `language_ids.json` is saved to {output_path}.") - print(" > `language_ids_file` is updated in the config.json.") + logger.info("`language_ids.json` is saved to %s", output_path) + logger.info("`language_ids_file` is updated in the config.json.") diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index 8bb9989224..e5cfdc1e61 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,3 +1,4 @@ +import logging from typing import Dict, List, Optional, Tuple, Union import librosa @@ -10,17 +11,21 @@ from torch.nn.utils import spectral_norm from 
torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations +from trainer.io import load_fsspec import TTS.vc.modules.freevc.commons as commons import TTS.vc.modules.freevc.modules as modules +from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.io import load_fsspec from TTS.vc.configs.freevc_config import FreeVCConfig from TTS.vc.models.base_vc import BaseVC -from TTS.vc.modules.freevc.commons import get_padding, init_weights +from TTS.vc.modules.freevc.commons import init_weights from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch from TTS.vc.modules.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx from TTS.vc.modules.freevc.wavlm import get_wavlm +from TTS.vocoder.models.hifigan_generator import get_padding + +logger = logging.getLogger(__name__) class ResidualCouplingBlock(nn.Module): @@ -77,7 +82,7 @@ def __init__( self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -152,7 +157,7 @@ def forward(self, x, g=None): return x def remove_weight_norm(self): - print("Removing weight norm...") + logger.info("Removing weight norm...") for l in self.ups: remove_parametrizations(l, "weight") for l in self.resblocks: @@ -164,7 +169,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList( [ norm_f(Conv2d(1, 32, 
(kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), @@ -201,7 +206,7 @@ def forward(self, x): class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList( [ norm_f(Conv1d(1, 16, 15, 1, padding=7)), @@ -377,9 +382,9 @@ def device(self): def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" - print(" > Loading pretrained speaker encoder model ...") + logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device ) def init_multispeaker(self, config: Coqpit): @@ -468,7 +473,7 @@ def inference(self, c, g=None, mel=None, c_lengths=None): Returns: torch.Tensor: Output tensor. """ - if c_lengths == None: + if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) if not self.use_spk: g = self.enc_spk.embed_utterance(mel) @@ -544,11 +549,10 @@ def voice_conversion(self, src, tgt): audio = audio[0][0].data.cpu().float().numpy() return audio - def eval_step(): - ... + def eval_step(): ... @staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): model = FreeVC(config) return model @@ -558,5 +562,4 @@ def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cach if eval: self.eval() - def train_step(): - ... + def train_step(): ... 
diff --git a/TTS/vc/modules/freevc/commons.py b/TTS/vc/modules/freevc/commons.py index e799cc2a5b..feea7f34dc 100644 --- a/TTS/vc/modules/freevc/commons.py +++ b/TTS/vc/modules/freevc/commons.py @@ -1,27 +1,17 @@ import math -import numpy as np import torch -from torch import nn from torch.nn import functional as F +from TTS.tts.utils.helpers import convert_pad_shape, sequence_mask -def init_weights(m, mean=0.0, std=0.01): + +def init_weights(m: torch.nn.Module, mean: float = 0.0, std: float = 0.01) -> None: classname = m.__class__.__name__ if classname.find("Conv") != -1: m.weight.data.normal_(mean, std) -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - def intersperse(lst, item): result = [item] * (len(lst) * 2 + 1) result[1::2] = lst @@ -121,20 +111,11 @@ def shift_1d(x): return x -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - def generate_path(duration, mask): """ duration: [b, 1, t_x] mask: [b, 1, t_y, t_x] """ - device = duration.device - b, _, t_y, t_x = mask.shape cum_duration = torch.cumsum(duration, -1) diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py index 2dcbf21493..a3e251891a 100644 --- a/TTS/vc/modules/freevc/mel_processing.py +++ b/TTS/vc/modules/freevc/mel_processing.py @@ -1,7 +1,11 @@ +import logging + import torch import torch.utils.data from librosa.filters import mel as librosa_mel_fn +logger = logging.getLogger(__name__) + MAX_WAV_VALUE = 32768.0 @@ -39,9 +43,9 @@ def spectral_de_normalize_torch(magnitudes): def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): if torch.min(y) < -1.0: - print("min value is ", 
torch.min(y)) + logger.info("Min value is: %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("Max value is: %.3f", torch.max(y)) global hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -54,17 +58,19 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -85,9 +91,9 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): if torch.min(y) < -1.0: - print("min value is ", torch.min(y)) + logger.info("Min value is: %.3f", torch.min(y)) if torch.max(y) > 1.0: - print("max value is ", torch.max(y)) + logger.info("Max value is: %.3f", torch.max(y)) global mel_basis, hann_window dtype_device = str(y.dtype) + "_" + str(y.device) @@ -104,17 +110,19 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = 
torch.sqrt(spec.pow(2).sum(-1) + 1e-6) diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/modules/freevc/modules.py index 9bb5499003..722444a303 100644 --- a/TTS/vc/modules/freevc/modules.py +++ b/TTS/vc/modules/freevc/modules.py @@ -6,26 +6,13 @@ from torch.nn.utils.parametrize import remove_parametrizations import TTS.vc.modules.freevc.commons as commons -from TTS.vc.modules.freevc.commons import get_padding, init_weights +from TTS.tts.layers.generic.normalization import LayerNorm2 +from TTS.vc.modules.freevc.commons import init_weights +from TTS.vocoder.models.hifigan_generator import get_padding LRELU_SLOPE = 0.1 -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - class ConvReluNorm(nn.Module): def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): super().__init__() @@ -40,11 +27,11 @@ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_la self.conv_layers = nn.ModuleList() self.norm_layers = nn.ModuleList() self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) + self.norm_layers.append(LayerNorm2(hidden_channels)) self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) for _ in range(n_layers - 1): self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) + self.norm_layers.append(LayerNorm2(hidden_channels)) self.proj = nn.Conv1d(hidden_channels, out_channels, 1) self.proj.weight.data.zero_() self.proj.bias.data.zero_() @@ -59,48 +46,6 @@ 
def forward(self, x, x_mask): return x * x_mask -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): super(WN, self).__init__() @@ -317,24 +262,6 @@ def forward(self, x, *args, reverse=False, **kwargs): return x -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - class ResidualCouplingLayer(nn.Module): def __init__( self, 
diff --git a/TTS/vc/modules/freevc/speaker_encoder/audio.py b/TTS/vc/modules/freevc/speaker_encoder/audio.py index 52f6fd0893..5b23a4dbb6 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/audio.py +++ b/TTS/vc/modules/freevc/speaker_encoder/audio.py @@ -1,13 +1,17 @@ -import struct from pathlib import Path from typing import Optional, Union # import webrtcvad import librosa import numpy as np -from scipy.ndimage.morphology import binary_dilation -from TTS.vc.modules.freevc.speaker_encoder.hparams import * +from TTS.vc.modules.freevc.speaker_encoder.hparams import ( + audio_norm_target_dBFS, + mel_n_channels, + mel_window_length, + mel_window_step, + sampling_rate, +) int16_max = (2**15) - 1 diff --git a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py index 2e21a14fd8..294bf322cb 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py @@ -1,18 +1,28 @@ -from pathlib import Path +import logging from time import perf_counter as timer from typing import List, Union import numpy as np import torch from torch import nn +from trainer.io import load_fsspec -from TTS.utils.io import load_fsspec from TTS.vc.modules.freevc.speaker_encoder import audio -from TTS.vc.modules.freevc.speaker_encoder.hparams import * +from TTS.vc.modules.freevc.speaker_encoder.hparams import ( + mel_n_channels, + mel_window_step, + model_embedding_size, + model_hidden_size, + model_num_layers, + partials_n_frames, + sampling_rate, +) + +logger = logging.getLogger(__name__) class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None, verbose=True): + def __init__(self, weights_fpath, device: Union[str, torch.device] = None): """ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). 
If None, defaults to cuda if it is available on your machine, otherwise the model will @@ -43,9 +53,7 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None, verbo self.load_state_dict(checkpoint["model_state"], strict=False) self.to(device) - - if verbose: - print("Loaded the voice encoder model on %s in %.2f seconds." % (device.type, timer() - start)) + logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) def forward(self, mels: torch.FloatTensor): """ diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/modules/freevc/wavlm/__init__.py index 6edada407b..03b2f5827b 100644 --- a/TTS/vc/modules/freevc/wavlm/__init__.py +++ b/TTS/vc/modules/freevc/wavlm/__init__.py @@ -1,11 +1,14 @@ +import logging import os import urllib.request import torch +from trainer.io import get_user_data_dir -from TTS.utils.generic_utils import get_user_data_dir from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig +logger = logging.getLogger(__name__) + model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" @@ -20,7 +23,7 @@ def get_wavlm(device="cpu"): output_path = os.path.join(output_path, "WavLM-Large.pt") if not os.path.exists(output_path): - print(f" > Downloading WavLM model to {output_path} ...") + logger.info("Downloading WavLM model to %s ...", output_path) urllib.request.urlretrieve(model_uri, output_path) checkpoint = torch.load(output_path, map_location=torch.device(device)) diff --git a/TTS/vc/modules/freevc/wavlm/config.json b/TTS/vc/modules/freevc/wavlm/config.json index c6f851b93d..c2e414cf0b 100644 --- a/TTS/vc/modules/freevc/wavlm/config.json +++ b/TTS/vc/modules/freevc/wavlm/config.json @@ -96,4 +96,4 @@ "transformers_version": "4.15.0.dev0", "use_weighted_layer_sum": false, "vocab_size": 32 - } \ No newline at end of file + } diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/modules/freevc/wavlm/wavlm.py index fc93bd4f50..10dd09ed0c 
100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/modules/freevc/wavlm/wavlm.py @@ -155,7 +155,9 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + self.extractor_mode: str = ( + "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + ) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -164,7 +166,9 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] + self.conv_feature_layers: str = ( + "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] 
+ ) self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -387,7 +391,7 @@ def make_conv(): nn.init.kaiming_normal_(conv.weight) return conv - assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive" + assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive" if is_layer_norm: return nn.Sequential( diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 871eb0d202..04462817a8 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -10,7 +10,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, @@ -24,7 +24,6 @@ def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: return_segments=not is_eval, use_noise_augment=config.use_noise_augment, use_cache=config.use_cache, - verbose=verbose, ) dataset.shuffle_mapping() elif config.model.lower() == "wavegrad": @@ -39,7 +38,6 @@ def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: return_segments=True, use_noise_augment=False, use_cache=config.use_cache, - verbose=verbose, ) elif config.model.lower() == "wavernn": dataset = WaveRNNDataset( @@ -51,7 +49,6 @@ def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: mode=config.model_params.mode, mulaw=config.model_params.mulaw, is_training=not is_eval, - verbose=verbose, ) else: raise ValueError(f" [!] 
Dataset for model {config.model.lower()} cannot be found.") diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 50c38c4deb..0806c0d496 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -28,7 +28,6 @@ def __init__( return_segments=True, use_noise_augment=False, use_cache=False, - verbose=False, ): super().__init__() self.ap = ap @@ -43,7 +42,6 @@ def __init__( self.return_segments = return_segments self.use_cache = use_cache self.use_noise_augment = use_noise_augment - self.verbose = verbose assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." self.feat_frame_len = seq_len // hop_len + (2 * conv_pad) @@ -109,7 +107,6 @@ def load_item(self, idx): if self.compute_feat: # compute features from wav wavpath = self.item_list[idx] - # print(wavpath) if self.use_cache and self.cache[idx] is not None: audio, mel = self.cache[idx] diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 305fe430e3..6f34bccb7c 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -28,7 +28,6 @@ def __init__( return_segments=True, use_noise_augment=False, use_cache=False, - verbose=False, ): super().__init__() self.ap = ap @@ -41,7 +40,6 @@ def __init__( self.return_segments = return_segments self.use_cache = use_cache self.use_noise_augment = use_noise_augment - self.verbose = verbose if return_segments: assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." 
diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index a67c5b31a0..4c4f5c48df 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,9 +1,13 @@ +import logging + import numpy as np import torch from torch.utils.data import Dataset from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize +logger = logging.getLogger(__name__) + class WaveRNNDataset(Dataset): """ @@ -11,9 +15,7 @@ class WaveRNNDataset(Dataset): and converts them to acoustic features on the fly. """ - def __init__( - self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True - ): + def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap self.compute_feat = not isinstance(items[0], (tuple, list)) @@ -25,7 +27,6 @@ def __init__( self.mode = mode self.mulaw = mulaw self.is_training = is_training - self.verbose = verbose self.return_segments = return_segments assert self.seq_len % self.hop_len == 0 @@ -60,7 +61,7 @@ def load_item(self, index): else: min_audio_len = audio.shape[0] + (2 * self.pad * self.hop_len) if audio.shape[0] < min_audio_len: - print(" [!] Instance is too short! : {}".format(wavpath)) + logger.warning("Instance is too short: %s", wavpath) audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len]) mel = self.ap.melspectrogram(audio) @@ -80,7 +81,7 @@ def load_item(self, index): mel = np.load(feat_path.replace("/quant/", "/mel/")) if mel.shape[-1] < self.mel_len + 2 * self.pad: - print(" [!] Instance is too short! 
: {}".format(wavpath)) + logger.warning("Instance is too short: %s", wavpath) self.item_list[index] = self.item_list[index + 1] feat_path = self.item_list[index] mel = np.load(feat_path.replace("/quant/", "/mel/")) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 74cfc7262b..8d4dd725ef 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -221,7 +221,7 @@ class GeneratorLoss(nn.Module): changing configurations. Args: - C (AttrDict): model configuration. + C (Coqpit): model configuration. """ def __init__(self, C): @@ -298,7 +298,7 @@ def forward( adv_loss = adv_loss + self.hinge_gan_loss_weight * hinge_fake_loss # Feature Matching Loss - if self.use_feat_match_loss and not feats_fake is None: + if self.use_feat_match_loss and feats_fake is not None: feat_match_loss = self.feat_match_loss(feats_fake, feats_real) return_dict["G_feat_match_loss"] = feat_match_loss adv_loss = adv_loss + self.feat_match_loss_weight * feat_match_loss diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index 65901617b6..7a1716f16d 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -1,8 +1,11 @@ import importlib +import logging import re from coqpit import Coqpit +logger = logging.getLogger(__name__) + def to_camel(text): text = text.capitalize() @@ -27,13 +30,13 @@ def setup_model(config: Coqpit): MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: raise ValueError(f"Model {config.model} not exist!") from e - print(" > Vocoder Model: {}".format(config.model)) + logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) def setup_generator(c): """TODO: use config object as arguments""" - print(" > Generator Model: {}".format(c.generator_model)) + logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) @@ -96,7 +99,7 @@ def setup_generator(c): def setup_discriminator(c): """TODO: use config objekt as arguments""" - print(" > Discriminator Model: {}".format(c.discriminator_model)) + logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") else: diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 19c30e983e..8792950a56 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -7,10 +7,10 @@ from torch import nn from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from trainer.io import load_fsspec from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_fsspec from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss from TTS.vocoder.models import setup_discriminator, setup_generator @@ -349,7 +349,6 @@ def get_data_loader( # pylint: disable=no-self-use, unused-argument return_segments=not is_eval, use_noise_augment=config.use_noise_augment, use_cache=config.use_cache, - verbose=verbose, ) dataset.shuffle_mapping() sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None @@ -369,6 +368,6 @@ def get_criterion(self): return [DiscriminatorLoss(self.config), GeneratorLoss(self.config)] @staticmethod - def init_from_config(config: Coqpit, verbose=True) -> "GAN": - ap = AudioProcessor.init_from_config(config, verbose=verbose) + def init_from_config(config: Coqpit) -> "GAN": + ap = AudioProcessor.init_from_config(config) return GAN(config, ap=ap) diff --git a/TTS/vocoder/models/hifigan_discriminator.py 
b/TTS/vocoder/models/hifigan_discriminator.py index 7447a5fbc4..1cbc6ab357 100644 --- a/TTS/vocoder/models/hifigan_discriminator.py +++ b/TTS/vocoder/models/hifigan_discriminator.py @@ -3,6 +3,8 @@ from torch import nn from torch.nn import functional as F +from TTS.vocoder.models.hifigan_generator import get_padding + LRELU_SLOPE = 0.1 @@ -29,7 +31,6 @@ class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super().__init__() self.period = period - get_padding = lambda k, d: int((k * d - d) / 2) norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm self.convs = nn.ModuleList( [ diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 9247532259..afdd59a859 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -1,18 +1,21 @@ # adopted from https://github.com/jik876/hifi-gan/blob/master/models.py +import logging + import torch from torch import nn from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations +from trainer.io import load_fsspec -from TTS.utils.io import load_fsspec +logger = logging.getLogger(__name__) LRELU_SLOPE = 0.1 -def get_padding(k, d): - return int((k * d - d) / 2) +def get_padding(kernel_size: int, dilation: int = 1) -> int: + return int((kernel_size * dilation - dilation) / 2) class ResBlock1(torch.nn.Module): @@ -282,7 +285,7 @@ def inference(self, c): return self.forward(c) def remove_weight_norm(self): - print("Removing weight norm...") + logger.info("Removing weight norm...") for l in self.ups: remove_parametrizations(l, "weight") for l in self.resblocks: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index bb3fee789c..03c971afa4 100644 --- 
a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -1,8 +1,8 @@ import torch from torch import nn from torch.nn.utils.parametrizations import weight_norm +from trainer.io import load_fsspec -from TTS.utils.io import load_fsspec from TTS.vocoder.layers.melgan import ResidualStack diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index d02af75f05..211d45d91c 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -1,3 +1,4 @@ +import logging import math import torch @@ -6,6 +7,8 @@ from TTS.vocoder.layers.parallel_wavegan import ResidualBlock +logger = logging.getLogger(__name__) + class ParallelWaveganDiscriminator(nn.Module): """PWGAN discriminator as in https://arxiv.org/abs/1910.11480. @@ -76,7 +79,7 @@ def _apply_weight_norm(m): def remove_weight_norm(self): def _remove_weight_norm(m): try: - # print(f"Weight norm is removed from {m}.") + logger.info("Weight norm is removed from %s", m) remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -179,7 +182,7 @@ def _apply_weight_norm(m): def remove_weight_norm(self): def _remove_weight_norm(m): try: - print(f"Weight norm is removed from {m}.") + logger.info("Weight norm is removed from %s", m) remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 8338d94653..6a4d4ca6e7 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -1,13 +1,16 @@ +import logging import math import numpy as np import torch from torch.nn.utils.parametrize import remove_parametrizations +from trainer.io import load_fsspec -from TTS.utils.io import load_fsspec from 
TTS.vocoder.layers.parallel_wavegan import ResidualBlock from TTS.vocoder.layers.upsample import ConvUpsample +logger = logging.getLogger(__name__) + class ParallelWaveganGenerator(torch.nn.Module): """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. @@ -126,7 +129,7 @@ def inference(self, c): def remove_weight_norm(self): def _remove_weight_norm(m): try: - # print(f"Weight norm is removed from {m}.") + logger.info("Weight norm is removed from %s", m) remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -137,7 +140,7 @@ def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): torch.nn.utils.parametrizations.weight_norm(m) - # print(f"Weight norm is applied to {m}.") + logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 5e66b70df8..72e57a9c39 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,3 +1,4 @@ +import logging from typing import List import numpy as np @@ -7,6 +8,8 @@ from TTS.vocoder.layers.lvc_block import LVCBlock +logger = logging.getLogger(__name__) + LRELU_SLOPE = 0.1 @@ -113,7 +116,7 @@ def remove_weight_norm(self): def _remove_weight_norm(m): try: - # print(f"Weight norm is removed from {m}.") + logger.info("Weight norm is removed from %s", m) parametrize.remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -126,7 +129,7 @@ def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): torch.nn.utils.parametrizations.weight_norm(m) - # print(f"Weight norm is applied to {m}.") + logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c1166e0914..c49abd2201 
100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -9,9 +9,9 @@ from torch.nn.utils.parametrize import remove_parametrizations from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from trainer.io import load_fsspec from trainer.trainer_utils import get_optimizer, get_scheduler -from TTS.utils.io import load_fsspec from TTS.vocoder.datasets import WaveGradDataset from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock from TTS.vocoder.models.base_vocoder import BaseVocoder @@ -321,7 +321,6 @@ def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: return_segments=True, use_noise_augment=False, use_cache=config.use_cache, - verbose=verbose, ) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 7f74ba3ebf..723f18dde2 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -10,11 +10,11 @@ from torch import nn from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from trainer.io import load_fsspec from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import mulaw_decode -from TTS.utils.io import load_fsspec from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.layers.losses import WaveRNNLoss from TTS.vocoder.models.base_vocoder import BaseVocoder @@ -91,7 +91,7 @@ def __init__( use_aux_net, ): super().__init__() - self.total_scale = np.cumproduct(upsample_scales)[-1] + self.total_scale = np.cumprod(upsample_scales)[-1] self.indent = pad * self.total_scale self.use_aux_net = use_aux_net if use_aux_net: @@ -239,7 +239,7 @@ class of models has however remained an elusive problem. 
With a focus on text-to if self.args.use_upsample_net: assert ( - np.cumproduct(self.args.upsample_factors)[-1] == config.audio.hop_length + np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length ), " [!] upsample scales needs to be equal to hop_length" self.upsample = UpsampleNetwork( self.args.feat_dims, @@ -623,7 +623,6 @@ def get_data_loader( # pylint: disable=no-self-use mode=config.model_args.mode, mulaw=config.model_args.mulaw, is_training=not is_eval, - verbose=verbose, ) sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None loader = DataLoader( diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 63a0af4445..ac797d97f7 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,3 +1,4 @@ +import logging from typing import Dict import numpy as np @@ -7,6 +8,8 @@ from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor +logger = logging.getLogger(__name__) + def interpolate_vocoder_input(scale_factor, spec): """Interpolate spectrogram by the scale factor. @@ -20,12 +23,12 @@ def interpolate_vocoder_input(scale_factor, spec): Returns: torch.tensor: interpolated spectrogram. """ - print(" > before interpolation :", spec.shape) + logger.info("Before interpolation: %s", spec.shape) spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0) # pylint: disable=not-callable spec = torch.nn.functional.interpolate( spec, scale_factor=scale_factor, recompute_scale_factor=True, mode="bilinear", align_corners=False ).squeeze(0) - print(" > after interpolation :", spec.shape) + logger.info("After interpolation: %s", spec.shape) return spec @@ -40,7 +43,7 @@ def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_ Returns: Dict: output figures keyed by the name of the figures. 
- """ """Plot vocoder model results""" + """ if name_prefix is None: name_prefix = "" diff --git a/dockerfiles/Dockerfile.dev b/dockerfiles/Dockerfile.dev index 58baee53e2..af0d3fc0cd 100644 --- a/dockerfiles/Dockerfile.dev +++ b/dockerfiles/Dockerfile.dev @@ -11,34 +11,13 @@ RUN apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Install Major Python Dependencies: +RUN pip3 install -U pip setuptools RUN pip3 install llvmlite --ignore-installed RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 RUN rm -rf /root/.cache/pip -WORKDIR /root - -# Copy Dependency Lock Files: -COPY \ - Makefile \ - pyproject.toml \ - setup.py \ - requirements.dev.txt \ - requirements.ja.txt \ - requirements.notebooks.txt \ - requirements.txt \ - /root/ - -# Install Project Dependencies -# Separate stage to limit re-downloading: -RUN pip install \ - -r requirements.txt \ - -r requirements.dev.txt \ - -r requirements.ja.txt \ - -r requirements.notebooks.txt - # Copy TTS repository contents: +WORKDIR /root COPY . /root -# Installing the TTS package itself: RUN make install - diff --git a/docs/requirements.txt b/docs/requirements.txt index efbefec44b..86ccae9cca 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,4 @@ myst-parser == 2.0.0 sphinx == 7.2.5 sphinx_inline_tabs sphinx_copybutton -linkify-it-py \ No newline at end of file +linkify-it-py diff --git a/docs/source/conf.py b/docs/source/conf.py index b85324fd40..e7d36c1f43 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,26 +10,24 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import importlib.metadata import os import sys -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath("../..")) # mock deps with system level requirements. 
autodoc_mock_imports = ["soundfile"] # -- Project information ----------------------------------------------------- -project = 'TTS' +project = "coqui-tts" copyright = "2021 Coqui GmbH, 2020 TTS authors" -author = 'Coqui GmbH' - -with open("../../TTS/VERSION", "r") as ver: - version = ver.read().strip() +author = "Coqui GmbH" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -release = version +release = importlib.metadata.version(project) # The main toctree document. master_doc = "index" @@ -40,32 +38,34 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'sphinx.ext.autosectionlabel', - 'myst_parser', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.autosectionlabel", + "myst_parser", "sphinx_copybutton", "sphinx_inline_tabs", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'TODO/*'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "TODO/*"] source_suffix = [".rst", ".md"] -myst_enable_extensions = ['linkify',] +myst_enable_extensions = [ + "linkify", +] # 'sphinxcontrib.katex', # 'sphinx.ext.autosectionlabel', @@ -76,17 +76,17 @@ # duplicated section names that are in different documents. 
autosectionlabel_prefix_document = True -language = 'en' +language = "en" autodoc_inherit_docstrings = False # Disable displaying type annotations, these can be very verbose -autodoc_typehints = 'none' +autodoc_typehints = "none" # Enable overriding of function signatures in the first line of the docstring. autodoc_docstring_signature = True -napoleon_custom_sections = [('Shapes', 'shape')] +napoleon_custom_sections = [("Shapes", "shape")] # -- Options for HTML output ------------------------------------------------- @@ -94,7 +94,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'furo' +html_theme = "furo" html_tite = "TTS" html_theme_options = { "light_logo": "logo.png", @@ -103,18 +103,18 @@ } html_sidebars = { - '**': [ - "sidebar/scroll-start.html", - "sidebar/brand.html", - "sidebar/search.html", - "sidebar/navigation.html", - "sidebar/ethical-ads.html", - "sidebar/scroll-end.html", - ] - } + "**": [ + "sidebar/scroll-start.html", + "sidebar/brand.html", + "sidebar/search.html", + "sidebar/navigation.html", + "sidebar/ethical-ads.html", + "sidebar/scroll-end.html", + ] +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index d08a55837d..58d961203e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -32,7 +32,7 @@ For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." 
--out_path /root/tts-output/hello.wav --use_cuda true +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -50,7 +50,7 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ```bash docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts python3 TTS/server/server.py --list_models #To get the list of available models -python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true +python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` -Click [there](http://[::1]:5002/) and have fun with the server! \ No newline at end of file +Click [there](http://[::1]:5002/) and have fun with the server! diff --git a/docs/source/faq.md b/docs/source/faq.md index fa48c4a9fb..1090aaa35c 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -3,7 +3,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is ## Errors with a pre-trained model. How can I resolve this? - Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. -- If it is still problematic, post your problem on [Discussions](https://github.com/coqui-ai/TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) +- If it is still problematic, post your problem on [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) - If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. ## What are the requirements of a good 🐸TTS dataset? @@ -16,7 +16,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. 
It is - If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. ## How can I train my own `tts` model? -0. Check your dataset with notebooks in [dataset_analysis](https://github.com/coqui-ai/TTS/tree/master/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/coqui-ai/TTS/blob/master/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. +0. Check your dataset with notebooks in [dataset_analysis](https://github.com/idiap/coqui-ai-TTS/tree/main/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/idiap/coqui-ai-TTS/blob/main/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. 1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. A `formatter` parses the metadata file and converts a list of training samples. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index 069f565137..548e385ec7 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -111,4 +111,3 @@ them and fine-tune it for your own dataset. This will help you in two main ways: --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` - diff --git a/docs/source/inference.md b/docs/source/inference.md index 56bccfb5b2..4cb8f45a71 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -4,7 +4,7 @@ First, you need to install TTS. We recommend using PyPi. You need to call the command below: ```bash -$ pip install TTS +$ pip install coqui-tts ``` After the installation, 2 terminal commands are available. @@ -14,7 +14,7 @@ After the installation, 2 terminal commands are available. 3. 
In 🐍Python. - `from TTS.api import TTS` ## On the Commandline - `tts` -![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) +![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. @@ -81,11 +81,13 @@ tts --model_name "voice_conversion///" ## On the Demo Server - `tts-server` - -![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) -You can boot up a demo 🐸TTS server to run an inference with your models. Note that the server is not optimized for performance -but gives you an easy way to interact with the models. +You can boot up a demo 🐸TTS server to run an inference with your models (make +sure to install the additional dependencies with `pip install coqui-tts[server]`). +Note that the server is not optimized for performance but gives you an easy way +to interact with the models. The demo server provides pretty much the same interface as the CLI command. diff --git a/docs/source/installation.md b/docs/source/installation.md index c4d05361f4..405c436643 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,6 +1,6 @@ # Installation -🐸TTS supports python >=3.7 <3.11.0 and tested on Ubuntu 18.10, 19.10, 20.10. +🐸TTS supports python >=3.9 <3.13.0 and was tested on Ubuntu 22.04. 
## Using `pip` @@ -9,13 +9,13 @@ You can install from PyPI as follows: ```bash -pip install TTS # from PyPI +pip install coqui-tts # from PyPI ``` Or install from Github: ```bash -pip install git+https://github.com/coqui-ai/TTS # from Github +pip install git+https://github.com/idiap/coqui-ai-TTS # from Github ``` ## Installing From Source @@ -23,11 +23,18 @@ pip install git+https://github.com/coqui-ai/TTS # from Github This is recommended for development and more control over 🐸TTS. ```bash -git clone https://github.com/coqui-ai/TTS/ -cd TTS +git clone https://github.com/idiap/coqui-ai-TTS +cd coqui-ai-TTS make system-deps # only on Linux systems. + +# Install package and optional extras make install + +# Same as above + dev dependencies and pre-commit +make install_dev ``` ## On Windows -If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/ \ No newline at end of file +If you are on Windows, 👑@GuyPaddock wrote installation instructions +[here](https://stackoverflow.com/questions/66726331/) (note that these are out +of date, e.g. you need to have at least Python 3.9) diff --git a/docs/source/main_classes/audio_processor.md b/docs/source/main_classes/audio_processor.md index 600b0db582..98e94a8789 100644 --- a/docs/source/main_classes/audio_processor.md +++ b/docs/source/main_classes/audio_processor.md @@ -22,4 +22,4 @@ also must inherit or initiate `BaseAudioConfig`. ```{eval-rst} .. autoclass:: TTS.config.shared_configs.BaseAudioConfig :members: -``` \ No newline at end of file +``` diff --git a/docs/source/main_classes/dataset.md b/docs/source/main_classes/dataset.md index 92d381aca5..1566488194 100644 --- a/docs/source/main_classes/dataset.md +++ b/docs/source/main_classes/dataset.md @@ -22,4 +22,4 @@ ```{eval-rst} .. 
autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset :members: -``` \ No newline at end of file +``` diff --git a/docs/source/main_classes/gan.md b/docs/source/main_classes/gan.md index 4524b4b5c5..e143f6431e 100644 --- a/docs/source/main_classes/gan.md +++ b/docs/source/main_classes/gan.md @@ -9,4 +9,4 @@ to do its ✨️. ```{eval-rst} .. autoclass:: TTS.vocoder.models.gan.GAN :members: -``` \ No newline at end of file +``` diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 0e6f2d9427..71b3d41640 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -21,4 +21,4 @@ Model API provides you a set of functions that easily make your model compatible ```{eval-rst} .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder :members: -``` \ No newline at end of file +``` diff --git a/docs/source/main_classes/speaker_manager.md b/docs/source/main_classes/speaker_manager.md index ba4b55dc78..fe98823956 100644 --- a/docs/source/main_classes/speaker_manager.md +++ b/docs/source/main_classes/speaker_manager.md @@ -8,4 +8,4 @@ especially useful for multi-speaker models. ```{eval-rst} .. automodule:: TTS.tts.utils.speakers :members: -``` \ No newline at end of file +``` diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index 876e09e5b6..335294aa4d 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a separate project on https://github.com/coqui-ai/Trainer +We made the trainer a separate project on https://github.com/eginhard/coqui-trainer diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md index c328ae6110..a180afbb91 100644 --- a/docs/source/models/bark.md +++ b/docs/source/models/bark.md @@ -69,14 +69,12 @@ tts --model_name tts_models/multilingual/multi-dataset/bark \ --text "This is an example." 
\ --out_path "output.wav" \ --voice_dir bark_voices/ \ ---speaker_idx "ljspeech" \ ---progress_bar True +--speaker_idx "ljspeech" # Random voice generation tts --model_name tts_models/multilingual/multi-dataset/bark \ --text "This is an example." \ ---out_path "output.wav" \ ---progress_bar True +--out_path "output.wav" ``` diff --git a/docs/source/models/forward_tts.md b/docs/source/models/forward_tts.md index f8f941c2fd..d618e4e056 100644 --- a/docs/source/models/forward_tts.md +++ b/docs/source/models/forward_tts.md @@ -61,5 +61,3 @@ Currently we provide the following pre-configured architectures: .. autoclass:: TTS.tts.configs.fast_speech_config.FastSpeechConfig :members: ``` - - diff --git a/docs/source/models/overflow.md b/docs/source/models/overflow.md index 09e270eae5..042ad47474 100644 --- a/docs/source/models/overflow.md +++ b/docs/source/models/overflow.md @@ -33,4 +33,4 @@ are available at https://shivammehta25.github.io/OverFlow/. ```{eval-rst} .. autoclass:: TTS.tts.models.overflow.Overflow :members: -``` \ No newline at end of file +``` diff --git a/docs/source/models/tacotron1-2.md b/docs/source/models/tacotron1-2.md index 25721eba4c..285d4f3c55 100644 --- a/docs/source/models/tacotron1-2.md +++ b/docs/source/models/tacotron1-2.md @@ -20,8 +20,8 @@ If you have a limited VRAM, then you can try using the Guided Attention Loss or ## Important resources & papers -- Tacotron: https://arxiv.org/abs/2006.06873 -- Tacotron2: https://arxiv.org/abs/2008.03802 +- Tacotron: [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135) +- Tacotron2: [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884) - Double Decoder Consistency: https://coqui.ai/blog/tts/solving-attention-problems-of-tts-models-with-double-decoder-consistency - Guided Attention Loss: https://arxiv.org/abs/1710.08969 - Forward & Backward Decoder: https://arxiv.org/abs/1907.09006 @@ -59,5 +59,3 @@ If you have a 
limited VRAM, then you can try using the Guided Attention Loss or .. autoclass:: TTS.tts.configs.tacotron2_config.Tacotron2Config :members: ``` - - diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md index 1a8e9ca8e9..30afd1355b 100644 --- a/docs/source/models/tortoise.md +++ b/docs/source/models/tortoise.md @@ -57,14 +57,12 @@ tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ --text "This is an example." \ --out_path "output.wav" \ --voice_dir path/to/tortoise/voices/dir/ \ ---speaker_idx "lj" \ ---progress_bar True +--speaker_idx "lj" # Random voice generation tts --model_name tts_models/en/multi-dataset/tortoise-v2 \ --text "This is an example." \ ---out_path "output.wav" \ ---progress_bar True +--out_path "output.wav" ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index b979d04f6e..c07d879f7c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -3,9 +3,6 @@ ⓍTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. There is no need for an excessive amount of training data that spans countless hours. -This is the same model that powers [Coqui Studio](https://coqui.ai/), and [Coqui API](https://docs.coqui.ai/docs), however we apply -a few tricks to make it faster and support streaming inference. - ### Features - Voice cloning. - Cross-language voice cloning. @@ -17,36 +14,50 @@ a few tricks to make it faster and support streaming inference. ### Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. -- 2 new languages: Hungarian and Korean. - Across the board quality improvements. ### Code Current implementation only supports inference and GPT encoder training. 
### Languages -As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). - -Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out. +XTTS-v2 supports 17 languages: + +- Arabic (ar) +- Chinese (zh-cn) +- Czech (cs) +- Dutch (nl) +- English (en) +- French (fr) +- German (de) +- Hindi (hi) +- Hungarian (hu) +- Italian (it) +- Japanese (ja) +- Korean (ko) +- Polish (pl) +- Portuguese (pt) +- Russian (ru) +- Spanish (es) +- Turkish (tr) ### License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml). ### Contact -Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Twitter](https://twitter.com/coqui_ai). -You can also mail us at info@coqui.ai. +Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Github](https://github.com/idiap/coqui-ai-TTS/discussions). ### Inference #### 🐸TTS Command line -You can check all supported languages with the following command: +You can check all supported languages with the following command: ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ --list_language_idx ``` -You can check all Coqui available speakers with the following command: +You can check all Coqui available speakers with the following command: ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ @@ -61,7 +72,7 @@ You can do inference using one of the available speakers using the following com --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." 
\ --speaker_idx "Ana Florence" \ --language_idx en \ - --use_cuda true + --use_cuda ``` ##### Clone a voice @@ -74,7 +85,7 @@ You can clone a speaker voice using a single or multiple references: --text "Bugün okula gitmek istemiyorum." \ --speaker_wav /path/to/target/speaker.wav \ --language_idx tr \ - --use_cuda true + --use_cuda ``` ###### Multiple references @@ -83,7 +94,7 @@ You can clone a speaker voice using a single or multiple references: --text "Bugün okula gitmek istemiyorum." \ --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \ --language_idx tr \ - --use_cuda true + --use_cuda ``` or for all wav files in a directory you can use: @@ -92,7 +103,7 @@ or for all wav files in a directory you can use: --text "Bugün okula gitmek istemiyorum." \ --speaker_wav /path/to/target/*.wav \ --language_idx tr \ - --use_cuda true + --use_cuda ``` #### 🐸TTS API @@ -280,7 +291,7 @@ To make the `XTTS_v2` fine-tuning more accessible for users that do not have goo The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). -To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](). +To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](https://www.youtube.com/watch?v=8tpDiiouGxc). If you are not able to acess the video you need to follow the steps: @@ -294,7 +305,7 @@ If you are not able to acess the video you need to follow the steps: ##### Run demo locally To run the demo locally you need to do the following steps: -1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation). +1. Install 🐸 TTS following the instructions available [here](https://coqui-tts.readthedocs.io/en/latest/installation.html). 2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt` 3. 
Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py` 4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model. diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index acde3fc4c2..b417c4c45a 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -5,14 +5,14 @@ User friendly installation. Recommended only for synthesizing voice. ```bash -$ pip install TTS +$ pip install coqui-tts ``` Developer friendly installation. ```bash -$ git clone https://github.com/coqui-ai/TTS -$ cd TTS +$ git clone https://github.com/idiap/coqui-ai-TTS +$ cd coqui-ai-TTS $ pip install -e . ``` @@ -109,14 +109,15 @@ $ tts -h # see the help $ tts --list_models # list the available models. ``` -![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) +![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) -You can call `tts-server` to start a local demo server that you can open it on -your favorite web browser and 🗣️. +You can call `tts-server` to start a local demo server that you can open on +your favorite web browser and 🗣️ (make sure to install the additional +dependencies with `pip install coqui-tts[server]`). ```bash $ tts-server -h # see the help $ tts-server --list_models # list the available models. 
``` -![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/what_makes_a_good_dataset.md index 18c87453f7..44a93a39da 100644 --- a/docs/source/what_makes_a_good_dataset.md +++ b/docs/source/what_makes_a_good_dataset.md @@ -17,4 +17,4 @@ If you like to use a bespoken dataset, you might like to perform a couple of qua * **CheckSpectrograms** is to measure the noise level of the clips and find good audio processing parameters. The noise level might be observed by checking spectrograms. If spectrograms look cluttered, especially in silent parts, this dataset might not be a good candidate for a TTS project. If your voice clips are too noisy in the background, it makes things harder for your model to learn the alignment, and the final result might be different than the voice you are given. If the spectrograms look good, then the next step is to find a good set of audio processing parameters, defined in ```config.json```. In the notebook, you can compare different sets of parameters and see the resynthesis results in relation to the given ground-truth. Find the best parameters that give the best possible synthesis performance. -Another practical detail is the quantization level of the clips. If your dataset has a very high bit-rate, that might cause slow data-load time and consequently slow training. It is better to reduce the sample-rate of your dataset to around 16000-22050. \ No newline at end of file +Another practical detail is the quantization level of the clips. If your dataset has a very high bit-rate, that might cause slow data-load time and consequently slow training. It is better to reduce the sample-rate of your dataset to around 16000-22050. 
diff --git a/hubconf.py b/hubconf.py index 0c9c5930fc..6e10928265 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,15 +1,11 @@ -dependencies = [ - 'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite' -] +dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] import torch from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer -def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', - vocoder_name=None, - use_cuda=False): +def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, use_cuda=False): """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text. Example: @@ -28,19 +24,20 @@ def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', manager = ModelManager() model_path, config_path, model_item = manager.download_model(model_name) - vocoder_name = model_item[ - 'default_vocoder'] if vocoder_name is None else vocoder_name + vocoder_name = model_item["default_vocoder"] if vocoder_name is None else vocoder_name vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name) # create synthesizer - synt = Synthesizer(tts_checkpoint=model_path, - tts_config_path=config_path, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - use_cuda=use_cuda) + synt = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + use_cuda=use_cuda, + ) return synt -if __name__ == '__main__': - synthesizer = torch.hub.load('coqui-ai/TTS:dev', 'tts', source='github') +if __name__ == "__main__": + synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/images/TTS-performance.png b/images/TTS-performance.png deleted file mode 100644 index 68eebaf7e6..0000000000 Binary files 
a/images/TTS-performance.png and /dev/null differ diff --git a/images/tts_performance.png b/images/tts_performance.png deleted file mode 100644 index bdff06731e..0000000000 Binary files a/images/tts_performance.png and /dev/null differ diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 65edf98ca4..d85ca1035a 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -185,4 +185,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebooks/Tutorial_1_use-pretrained-TTS.ipynb b/notebooks/Tutorial_1_use-pretrained-TTS.ipynb index 87d04c499d..3c2e9de924 100644 --- a/notebooks/Tutorial_1_use-pretrained-TTS.ipynb +++ b/notebooks/Tutorial_1_use-pretrained-TTS.ipynb @@ -41,7 +41,7 @@ "outputs": [], "source": [ "! pip install -U pip\n", - "! pip install TTS" + "! pip install coqui-tts" ] }, { diff --git a/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb b/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb index 0f580a85b6..c4186670c9 100644 --- a/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb +++ b/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb @@ -32,7 +32,7 @@ "source": [ "## Install Coqui TTS\n", "! pip install -U pip\n", - "! pip install TTS" + "! pip install coqui-tts" ] }, { @@ -44,7 +44,7 @@ "\n", "### **First things first**: we need some data.\n", "\n", - "We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise and vocabulary coverage can be found in the [🐸TTS documentation](https://tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n", + "We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. 
The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise and vocabulary coverage can be found in the [🐸TTS documentation](https://coqui-tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n", "\n", "If you have a single audio file and you need to **split** it into clips. It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using **wav** file format.\n", "\n", diff --git a/notebooks/dataset_analysis/CheckPitch.ipynb b/notebooks/dataset_analysis/CheckPitch.ipynb index 72afbc64a1..ebdac87378 100644 --- a/notebooks/dataset_analysis/CheckPitch.ipynb +++ b/notebooks/dataset_analysis/CheckPitch.ipynb @@ -176,4 +176,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/notebooks/dataset_analysis/README.md b/notebooks/dataset_analysis/README.md index 79faf52159..9fe40d01a4 100644 --- a/notebooks/dataset_analysis/README.md +++ b/notebooks/dataset_analysis/README.md @@ -2,6 +2,6 @@ By the use of this notebook, you can easily analyze a brand new dataset, find exceptional cases and define your training set. -What we are looking in here is reasonable distribution of instances in terms of sequence-length, audio-length and word-coverage. +What we are looking in here is reasonable distribution of instances in terms of sequence-length, audio-length and word-coverage. 
This notebook is inspired from https://github.com/MycroftAI/mimic2 diff --git a/pyproject.toml b/pyproject.toml index 922575305c..94ed3a2c36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,207 @@ [build-system] requires = [ "setuptools", - "wheel", + "setuptools-scm", "cython~=0.29.30", - "numpy>=1.22.0", - "packaging", + "numpy>=2.0.0", ] +build-backend = "setuptools.build_meta" -[flake8] -max-line-length=120 +[tool.setuptools.packages.find] +include = ["TTS*"] + +[project] +name = "coqui-tts" +version = "0.24.1" +description = "Deep learning for Text to Speech." +readme = "README.md" +requires-python = ">=3.9, <3.13" +license = {text = "MPL-2.0"} +authors = [ + {name = "Eren Gölge", email = "egolge@coqui.ai"} +] +maintainers = [ + {name = "Enno Hermann", email = "enno.hermann@gmail.com"} +] +classifiers = [ + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Operating System :: POSIX :: Linux", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Multimedia :: Sound/Audio :: Speech", + "Topic :: Multimedia :: Sound/Audio", + "Topic :: Multimedia", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = [ + # Core + "numpy>=1.24.3,<2.0.0", # TODO: remove upper bound after spacy/thinc release + "cython>=0.29.30", + "scipy>=1.11.2", + "torch>=2.1", + "torchaudio", + "soundfile>=0.12.0", + "librosa>=0.10.1", + "inflect>=5.6.0", + "tqdm>=4.64.1", + "anyascii>=0.3.0", + "pyyaml>=6.0", + "fsspec[http]>=2023.6.0", + "packaging>=23.1", + # Inference + "pysbd>=0.3.4", + # Training + 
"matplotlib>=3.7.0", + # Coqui stack + "coqui-tts-trainer>=0.1.4", + "coqpit>=0.0.16", + # Gruut + supported languages + "gruut[de,es,fr]>=2.4.0", + # Tortoise + "einops>=0.6.0", + "transformers>=4.42.0,<4.43.0", + # Bark + "encodec>=0.1.1", + # XTTS + "num2words>=0.5.11", + "spacy[ja]>=3" +] + +[project.optional-dependencies] +# Development dependencies +dev = [ + "black==24.2.0", + "coverage[toml]>=7", + "nose2>=0.15", + "pre-commit>=3", + "ruff==0.4.9", + "tomli>=2; python_version < '3.11'", +] +# Dependencies for building the documentation +docs = [ + "furo>=2023.5.20", + "myst-parser==2.0.0", + "sphinx==7.2.5", + "sphinx_inline_tabs>=2023.4.21", + "sphinx_copybutton>=0.1", + "linkify-it-py>=2.0.0", +] +# Only used in notebooks +notebooks = [ + "bokeh==1.4.0", + "pandas>=1.4,<2.0", + "umap-learn>=0.5.1", +] +# For running the TTS server +server = ["flask>=3.0.0"] +# Language-specific dependencies, mainly for G2P +# Bangla +bn = [ + "bangla>=0.0.2", + "bnnumerizer>=0.0.2", + "bnunicodenormalizer>=0.1.0", +] +# Korean +ko = [ + "hangul_romanize>=0.1.0", + "jamo>=0.4.1", + "g2pkk>=0.1.1", +] +# Japanese +ja = [ + "mecab-python3>=1.0.2", + "unidic-lite==1.0.8", + "cutlet>=0.2.0", +] +# Chinese +zh = [ + "jieba>=0.42.1", + "pypinyin>=0.40.0", +] +# All language-specific dependencies +languages = [ + "coqui-tts[bn,ja,ko,zh]", +] +# Installs all extras (except dev and docs) +all = [ + "coqui-tts[notebooks,server,bn,ja,ko,zh]", +] + +[project.urls] +Homepage = "https://github.com/idiap/coqui-ai-TTS" +Documentation = "https://coqui-tts.readthedocs.io" +Repository = "https://github.com/idiap/coqui-ai-TTS" +Issues = "https://github.com/idiap/coqui-ai-TTS/issues" +Discussions = "https://github.com/idiap/coqui-ai-TTS/discussions" + +[project.scripts] +tts = "TTS.bin.synthesize:main" +tts-server = "TTS.server.server:main" + +[tool.ruff] +target-version = "py39" +line-length = 120 +lint.extend-select = [ + "B033", # duplicate-value + "C416", # unnecessary-comprehension + 
"D419", # empty-docstring + "E999", # syntax-error + "F401", # unused-import + "F704", # yield-outside-function + "F706", # return-outside-function + "F841", # unused-variable + "I", # import sorting + "PIE790", # unnecessary-pass + "PLC", + "PLE", + "PLR0124", # comparison-with-itself + "PLR0206", # property-with-parameters + "PLR0911", # too-many-return-statements + "PLR1711", # useless-return + "PLW", + "W291", # trailing-whitespace + "NPY201", # NumPy 2.0 deprecation +] + +lint.ignore = [ + "E722", # bare except (TODO: fix these) + "E731", # don't use lambdas + "E741", # ambiguous variable name + "F821", # TODO: enable + "F841", # TODO: enable + "PLW0602", # TODO: enable + "PLW2901", # TODO: enable + "PLW0127", # TODO: enable + "PLW0603", # TODO: enable +] + +[tool.ruff.lint.pylint] +max-args = 5 +max-public-methods = 20 +max-returns = 7 + +[tool.ruff.lint.per-file-ignores] +"**/__init__.py" = [ + "F401", # init files may have "unused" imports for now + "F403", # init files may have star imports for now +] +"hubconf.py" = [ + "E402", # module level import not at top of file +] [tool.black] line-length = 120 target-version = ['py39'] -[tool.isort] -line_length = 120 -profile = "black" -multi_line_output = 3 +[tool.coverage.run] +parallel = true +source = ["TTS"] diff --git a/recipes/README.md b/recipes/README.md index 21a6727d8b..fcc4719aaa 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -19,4 +19,4 @@ python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed - If you train a new model using TTS, feel free to share your training to expand the list of recipes. -You can also open a new discussion and share your progress with the 🐸 community. \ No newline at end of file +You can also open a new discussion and share your progress with the 🐸 community. 
diff --git a/recipes/bel-alex73/README.md b/recipes/bel-alex73/README.md index ad378dd998..6075d3102d 100644 --- a/recipes/bel-alex73/README.md +++ b/recipes/bel-alex73/README.md @@ -39,7 +39,7 @@ Docker container was created for simplify local running. You can run `docker-pre ## Training - with GPU -You need to upload Coqui-TTS(/mycomputer/TTS/) and storage/ directory(/mycomputer/storage/) to some computer with GPU. We don't need cv-corpus/ and fanetyka/ directories for training. Install gcc, then run `pip install -e .[all,dev,notebooks]` to prepare modules. GlowTTS and HifiGan models should be learned separately based on /storage/filtered_dataset only, i.e. they are not dependent from each other. below means list of GPU ids from zero("0,1,2,3" for systems with 4 GPU). See details on the https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html(multi-gpu training). +You need to upload Coqui-TTS(/mycomputer/TTS/) and storage/ directory(/mycomputer/storage/) to some computer with GPU. We don't need cv-corpus/ and fanetyka/ directories for training. Install gcc, then run `pip install -e .[all,dev,notebooks]` to prepare modules. GlowTTS and HifiGan models should be learned separately based on /storage/filtered_dataset only, i.e. they are not dependent from each other. below means list of GPU ids from zero("0,1,2,3" for systems with 4 GPU). See details on the https://coqui-tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html (multi-gpu training). Current setup created for 24GiB GPU. You need to change batch_size if you have more or less GPU memory. Also, you can try to set lr(learning rate) to lower value in the end of training GlowTTS. 
diff --git a/recipes/bel-alex73/train_hifigan.py b/recipes/bel-alex73/train_hifigan.py index 3e740b2ff4..78221a9f2b 100644 --- a/recipes/bel-alex73/train_hifigan.py +++ b/recipes/bel-alex73/train_hifigan.py @@ -1,11 +1,8 @@ -import os - -from coqpit import Coqpit from trainer import Trainer, TrainerArgs from TTS.tts.configs.shared_configs import BaseAudioConfig from TTS.utils.audio import AudioProcessor -from TTS.vocoder.configs.hifigan_config import * +from TTS.vocoder.configs.hifigan_config import HifiganConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN diff --git a/recipes/blizzard2013/README.md b/recipes/blizzard2013/README.md index 9dcb739728..75f17a5513 100644 --- a/recipes/blizzard2013/README.md +++ b/recipes/blizzard2013/README.md @@ -9,4 +9,4 @@ To get a license and download link for this dataset, you need to visit the [webs You get access to the raw dataset in a couple of days. There are a few preprocessing steps you need to do to be able to use the high fidelity dataset. 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments). -2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation). \ No newline at end of file +2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation). 
diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index 69800cf7b4..3f18f2c3fb 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -20,4 +20,4 @@ CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/taco --coqpit.output_path $RUN_DIR \ --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ - --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json index c2e526f46c..f422203a31 100644 --- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -122,4 +122,4 @@ "use_gst": false, "use_external_speaker_embedding_file": false, "external_speaker_embedding_file": "../../speakers-vctk-en.json" -} \ No newline at end of file +} diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh index 9468988a99..21c3e0e2d7 100644 --- a/recipes/ljspeech/download_ljspeech.sh +++ b/recipes/ljspeech/download_ljspeech.sh @@ -11,4 +11,4 @@ shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ -rm LJSpeech-1.1.tar.bz2 \ No newline at end of file +rm LJSpeech-1.1.tar.bz2 diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 055526b1bc..64fd737b4e 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -65,7 +65,7 @@ model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") # TODO: make compute_attention python callable 
os.system( - f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" + f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda" ) # INITIALIZE THE AUDIO PROCESSOR diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 8c9a272e81..9839fcb339 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -64,7 +64,7 @@ model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") # TODO: make compute_attention python callable os.system( - f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" + f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda" ) # INITIALIZE THE AUDIO PROCESSOR diff --git a/recipes/ljspeech/fastspeech2/train_fastspeech2.py b/recipes/ljspeech/fastspeech2/train_fastspeech2.py index 93737dba7f..0a7a175605 100644 --- a/recipes/ljspeech/fastspeech2/train_fastspeech2.py +++ b/recipes/ljspeech/fastspeech2/train_fastspeech2.py @@ -67,7 +67,7 @@ model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") # TODO: make compute_attention python callable os.system( - f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" + 
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda" ) # INITIALIZE THE AUDIO PROCESSOR diff --git a/recipes/multilingual/cml_yourtts/train_yourtts.py b/recipes/multilingual/cml_yourtts/train_yourtts.py index 25a2fd0a4b..02f901fe73 100644 --- a/recipes/multilingual/cml_yourtts/train_yourtts.py +++ b/recipes/multilingual/cml_yourtts/train_yourtts.py @@ -4,7 +4,6 @@ from trainer import Trainer, TrainerArgs from TTS.bin.compute_embeddings import compute_embeddings -from TTS.bin.resample import resample_files from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples diff --git a/recipes/thorsten_DE/align_tts/train_aligntts.py b/recipes/thorsten_DE/align_tts/train_aligntts.py index 32cfd9967f..42363940f3 100644 --- a/recipes/thorsten_DE/align_tts/train_aligntts.py +++ b/recipes/thorsten_DE/align_tts/train_aligntts.py @@ -30,7 +30,7 @@ run_eval=True, test_delay_epochs=-1, epochs=1000, - text_cleaner="phoneme_cleaners", + text_cleaner="multilingual_phoneme_cleaners", use_phonemes=False, phoneme_language="de", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), diff --git a/recipes/thorsten_DE/glow_tts/train_glowtts.py b/recipes/thorsten_DE/glow_tts/train_glowtts.py index 00c67fb5d8..f7f4a186a2 100644 --- a/recipes/thorsten_DE/glow_tts/train_glowtts.py +++ b/recipes/thorsten_DE/glow_tts/train_glowtts.py @@ -40,7 +40,7 @@ run_eval=True, test_delay_epochs=-1, epochs=1000, - text_cleaner="phoneme_cleaners", + text_cleaner="multilingual_phoneme_cleaners", use_phonemes=True, phoneme_language="de", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), diff --git a/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py b/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py index a3d0b9db2b..024dcaa31e 100644 
--- a/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py +++ b/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py @@ -45,7 +45,7 @@ test_delay_epochs=-1, epochs=1000, min_audio_len=11050, # need to up min_audio_len to avois speedy speech error - text_cleaner="phoneme_cleaners", + text_cleaner="multilingual_phoneme_cleaners", use_phonemes=True, phoneme_language="de", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), diff --git a/recipes/thorsten_DE/tacotron2-DDC/train_tacotron_ddc.py b/recipes/thorsten_DE/tacotron2-DDC/train_tacotron_ddc.py index bc0274f5af..a46e27e91b 100644 --- a/recipes/thorsten_DE/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/thorsten_DE/tacotron2-DDC/train_tacotron_ddc.py @@ -49,7 +49,7 @@ gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], double_decoder_consistency=True, epochs=1000, - text_cleaner="phoneme_cleaners", + text_cleaner="multilingual_phoneme_cleaners", use_phonemes=True, phoneme_language="de", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), diff --git a/recipes/thorsten_DE/vits_tts/train_vits.py b/recipes/thorsten_DE/vits_tts/train_vits.py index 4ffa0f30f6..4b773c3508 100644 --- a/recipes/thorsten_DE/vits_tts/train_vits.py +++ b/recipes/thorsten_DE/vits_tts/train_vits.py @@ -40,7 +40,7 @@ run_eval=True, test_delay_epochs=-1, epochs=1000, - text_cleaner="phoneme_cleaners", + text_cleaner="multilingual_phoneme_cleaners", use_phonemes=True, phoneme_language="de", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), diff --git a/requirements.dev.txt b/requirements.dev.txt index 8c674727d3..74ec0cd80c 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,5 +1,8 @@ -black -coverage -isort -nose2 -pylint==2.10.2 +# Generated via scripts/generate_requirements.py and pre-commit hook. +# Do not edit this file; modify pyproject.toml instead. 
+black==24.2.0 +coverage[toml]>=7 +nose2>=0.15 +pre-commit>=3 +ruff==0.4.9 +tomli>=2; python_version < '3.11' diff --git a/requirements.ja.txt b/requirements.ja.txt deleted file mode 100644 index 4baab88a91..0000000000 --- a/requirements.ja.txt +++ /dev/null @@ -1,5 +0,0 @@ -# These cause some compatibility issues on some systems and are not strictly necessary -# japanese g2p deps -mecab-python3==1.0.6 -unidic-lite==1.0.8 -cutlet diff --git a/requirements.notebooks.txt b/requirements.notebooks.txt deleted file mode 100644 index 65d3f642c9..0000000000 --- a/requirements.notebooks.txt +++ /dev/null @@ -1 +0,0 @@ -bokeh==1.4.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1f7a44f6d8..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,56 +0,0 @@ -# core deps -numpy==1.22.0;python_version<="3.10" -numpy>=1.24.3;python_version>"3.10" -cython>=0.29.30 -scipy>=1.11.2 -torch>=2.1 -torchaudio -soundfile>=0.12.0 -librosa>=0.10.0 -scikit-learn>=1.3.0 -numba==0.55.1;python_version<"3.9" -numba>=0.57.0;python_version>="3.9" -inflect>=5.6.0 -tqdm>=4.64.1 -anyascii>=0.3.0 -pyyaml>=6.0 -fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail -aiohttp>=3.8.1 -packaging>=23.1 -# deps for examples -flask>=2.0.1 -# deps for inference -pysbd>=0.3.4 -# deps for notebooks -umap-learn>=0.5.1 -pandas>=1.4,<2.0 -# deps for training -matplotlib>=3.7.0 -# coqui stack -trainer>=0.0.32 -# config management -coqpit>=0.0.16 -# chinese g2p deps -jieba -pypinyin -# korean -hangul_romanize -# gruut+supported langs -gruut[de,es,fr]==2.2.3 -# deps for korean -jamo -nltk -g2pkk>=0.1.1 -# deps for bangla -bangla -bnnumerizer -bnunicodenormalizer -#deps for tortoise -einops>=0.6.0 -transformers>=4.33.0 -#deps for bark -encodec>=0.1.1 -# deps for XTTS -unidecode>=1.3.2 -num2words -spacy[ja]>=3 \ No newline at end of file diff --git a/scripts/generate_requirements.py b/scripts/generate_requirements.py new file mode 100644 index 
0000000000..bbd32bafd2 --- /dev/null +++ b/scripts/generate_requirements.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +"""Generate requirements/*.txt files from pyproject.toml. + +Adapted from: +https://github.com/numpy/numpydoc/blob/e7c6baf00f5f73a4a8f8318d0cb4e04949c9a5d1/tools/generate_requirements.py +""" + +import sys +from pathlib import Path + +try: # standard module since Python 3.11 + import tomllib as toml +except ImportError: + try: # available for older Python via pip + import tomli as toml + except ImportError: + sys.exit("Please install `tomli` first: `pip install tomli`") + +script_pth = Path(__file__) +repo_dir = script_pth.parent.parent +script_relpth = script_pth.relative_to(repo_dir) +header = [ + f"# Generated via {script_relpth.as_posix()} and pre-commit hook.", + "# Do not edit this file; modify pyproject.toml instead.", +] + + +def generate_requirement_file(name: str, req_list: list[str]) -> None: + req_fname = repo_dir / f"requirements.{name}.txt" + req_fname.write_text("\n".join(header + req_list) + "\n") + + +def main() -> None: + pyproject = toml.loads((repo_dir / "pyproject.toml").read_text()) + generate_requirement_file("dev", pyproject["project"]["optional-dependencies"]["dev"]) + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 1f31cb5dec..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[build_py] -build_lib=temp_build - -[bdist_wheel] -bdist_dir=temp_build - -[install_lib] -build_dir=temp_build diff --git a/setup.py b/setup.py index df14b41adc..1cf2def1d3 100644 --- a/setup.py +++ b/setup.py @@ -20,56 +20,9 @@ # .,*++++::::::++++*,. 
# `````` -import os -import subprocess -import sys -from packaging.version import Version - import numpy -import setuptools.command.build_py -import setuptools.command.develop from Cython.Build import cythonize -from setuptools import Extension, find_packages, setup - -python_version = sys.version.split()[0] -if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"): - raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version)) - - -cwd = os.path.dirname(os.path.abspath(__file__)) -with open(os.path.join(cwd, "TTS", "VERSION")) as fin: - version = fin.read().strip() - - -class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors - def run(self): - setuptools.command.build_py.build_py.run(self) - - -class develop(setuptools.command.develop.develop): - def run(self): - setuptools.command.develop.develop.run(self) - - -# The documentation for this feature is in server/README.md -package_data = ["TTS/server/templates/*"] - - -def pip_install(package_name): - subprocess.call([sys.executable, "-m", "pip", "install", package_name]) - - -requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines() -with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f: - requirements_notebooks = f.readlines() -with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f: - requirements_dev = f.readlines() -with open(os.path.join(cwd, "requirements.ja.txt"), "r") as f: - requirements_ja = f.readlines() -requirements_all = requirements_dev + requirements_notebooks + requirements_ja - -with open("README.md", "r", encoding="utf-8") as readme_file: - README = readme_file.read() +from setuptools import Extension, setup exts = [ Extension( @@ -78,64 +31,7 @@ def pip_install(package_name): ) ] setup( - name="TTS", - version=version, - url="https://github.com/coqui-ai/TTS", - author="Eren Gölge", - author_email="egolge@coqui.ai", - description="Deep 
learning for Text to Speech by Coqui.", - long_description=README, - long_description_content_type="text/markdown", - license="MPL-2.0", - # cython include_dirs=numpy.get_include(), ext_modules=cythonize(exts, language_level=3), - # ext_modules=find_cython_extensions(), - # package - include_package_data=True, - packages=find_packages(include=["TTS"], exclude=["*.tests", "*tests.*", "tests.*", "*tests", "tests"]), - package_data={ - "TTS": [ - "VERSION", - ] - }, - project_urls={ - "Documentation": "https://github.com/coqui-ai/TTS/wiki", - "Tracker": "https://github.com/coqui-ai/TTS/issues", - "Repository": "https://github.com/coqui-ai/TTS", - "Discussions": "https://github.com/coqui-ai/TTS/discussions", - }, - cmdclass={ - "build_py": build_py, - "develop": develop, - # 'build_ext': build_ext - }, - install_requires=requirements, - extras_require={ - "all": requirements_all, - "dev": requirements_dev, - "notebooks": requirements_notebooks, - "ja": requirements_ja, - }, - python_requires=">=3.9.0, <3.12", - entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, - classifiers=[ - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Development Status :: 3 - Alpha", - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "Operating System :: POSIX :: Linux", - "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", - "Topic :: Software Development", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Multimedia :: Sound/Audio :: Speech", - "Topic :: Multimedia :: Sound/Audio", - "Topic :: Multimedia", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], zip_safe=False, ) diff --git a/tests/__init__.py b/tests/__init__.py index e102a2dfee..f0a8b2f118 100644 --- a/tests/__init__.py +++ 
b/tests/__init__.py @@ -1,7 +1,8 @@ import os +from trainer.generic_utils import get_cuda + from TTS.config import BaseDatasetConfig -from TTS.utils.generic_utils import get_cuda def get_device_id(): diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh index d7f0ab9d4c..721777f852 100755 --- a/tests/bash_tests/test_compute_statistics.sh +++ b/tests/bash_tests/test_compute_statistics.sh @@ -4,4 +4,3 @@ BASEDIR=$(dirname "$0") echo "$BASEDIR" # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy - diff --git a/tests/data/dummy_speakers.json b/tests/data/dummy_speakers.json index 233533b796..507b57b5a5 100644 --- a/tests/data/dummy_speakers.json +++ b/tests/data/dummy_speakers.json @@ -100222,5 +100222,5 @@ 0.04999300092458725, -0.12125937640666962 ] - } + } } diff --git a/tests/data/ljspeech/metadata_flac.csv b/tests/data/ljspeech/metadata_flac.csv new file mode 100644 index 0000000000..fbde71d07d --- /dev/null +++ b/tests/data/ljspeech/metadata_flac.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 
+wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3 diff --git a/tests/data/ljspeech/metadata_mp3.csv b/tests/data/ljspeech/metadata_mp3.csv new file mode 100644 index 0000000000..a8c5ec2e76 --- /dev/null +++ b/tests/data/ljspeech/metadata_mp3.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for 
centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3 diff --git a/tests/data/ljspeech/metadata_wav.csv b/tests/data/ljspeech/metadata_wav.csv new file mode 100644 index 0000000000..1af6652e6a --- /dev/null +++ b/tests/data/ljspeech/metadata_wav.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar 
process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3 diff --git a/tests/data/ljspeech/wavs/LJ001-0001.flac b/tests/data/ljspeech/wavs/LJ001-0001.flac new file mode 100644 index 0000000000..ed3b009d4f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0001.mp3 b/tests/data/ljspeech/wavs/LJ001-0001.mp3 new file mode 100644 index 0000000000..da62c8d7f7 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.flac b/tests/data/ljspeech/wavs/LJ001-0002.flac new file mode 100644 index 0000000000..f6a607ea91 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.mp3 b/tests/data/ljspeech/wavs/LJ001-0002.mp3 new file 
mode 100644 index 0000000000..8eb527924f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.flac b/tests/data/ljspeech/wavs/LJ001-0003.flac new file mode 100644 index 0000000000..05f357a580 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.mp3 b/tests/data/ljspeech/wavs/LJ001-0003.mp3 new file mode 100644 index 0000000000..5bc4449880 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.flac b/tests/data/ljspeech/wavs/LJ001-0004.flac new file mode 100644 index 0000000000..547e7899a8 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.mp3 b/tests/data/ljspeech/wavs/LJ001-0004.mp3 new file mode 100644 index 0000000000..c68a1680f3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.flac b/tests/data/ljspeech/wavs/LJ001-0005.flac new file mode 100644 index 0000000000..94589dbba4 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.mp3 b/tests/data/ljspeech/wavs/LJ001-0005.mp3 new file mode 100644 index 0000000000..99c245b0c2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.flac b/tests/data/ljspeech/wavs/LJ001-0006.flac new file mode 100644 index 0000000000..87d32d339f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.mp3 b/tests/data/ljspeech/wavs/LJ001-0006.mp3 new file mode 100644 index 0000000000..bc6cb81fb3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.flac b/tests/data/ljspeech/wavs/LJ001-0007.flac new 
file mode 100644 index 0000000000..7e2b0f1de7 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.mp3 b/tests/data/ljspeech/wavs/LJ001-0007.mp3 new file mode 100644 index 0000000000..f1e34d1b87 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.flac b/tests/data/ljspeech/wavs/LJ001-0008.flac new file mode 100644 index 0000000000..6ca201a60b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.mp3 b/tests/data/ljspeech/wavs/LJ001-0008.mp3 new file mode 100644 index 0000000000..ede2f06802 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.flac b/tests/data/ljspeech/wavs/LJ001-0009.flac new file mode 100644 index 0000000000..cd272b5f72 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.mp3 b/tests/data/ljspeech/wavs/LJ001-0009.mp3 new file mode 100644 index 0000000000..1dd97c4892 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.flac b/tests/data/ljspeech/wavs/LJ001-0010.flac new file mode 100644 index 0000000000..875e01b019 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.mp3 b/tests/data/ljspeech/wavs/LJ001-0010.mp3 new file mode 100644 index 0000000000..a763be3cc5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.flac b/tests/data/ljspeech/wavs/LJ001-0011.flac new file mode 100644 index 0000000000..3a45005a1a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.mp3 b/tests/data/ljspeech/wavs/LJ001-0011.mp3 
new file mode 100644 index 0000000000..579854e193 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.flac b/tests/data/ljspeech/wavs/LJ001-0012.flac new file mode 100644 index 0000000000..2f78f762b3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.mp3 b/tests/data/ljspeech/wavs/LJ001-0012.mp3 new file mode 100644 index 0000000000..51212f906e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.flac b/tests/data/ljspeech/wavs/LJ001-0013.flac new file mode 100644 index 0000000000..50c7707fbf Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.mp3 b/tests/data/ljspeech/wavs/LJ001-0013.mp3 new file mode 100644 index 0000000000..a457bf9c6e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0014.flac b/tests/data/ljspeech/wavs/LJ001-0014.flac new file mode 100644 index 0000000000..f8a5fe8823 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0014.mp3 b/tests/data/ljspeech/wavs/LJ001-0014.mp3 new file mode 100644 index 0000000000..f4a3d66e69 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.flac b/tests/data/ljspeech/wavs/LJ001-0015.flac new file mode 100644 index 0000000000..99523288ba Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.mp3 b/tests/data/ljspeech/wavs/LJ001-0015.mp3 new file mode 100644 index 0000000000..f0db88e17d Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.flac 
b/tests/data/ljspeech/wavs/LJ001-0016.flac new file mode 100644 index 0000000000..66b7ca9590 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.mp3 b/tests/data/ljspeech/wavs/LJ001-0016.mp3 new file mode 100644 index 0000000000..cd14b20478 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.flac b/tests/data/ljspeech/wavs/LJ001-0017.flac new file mode 100644 index 0000000000..56725cce10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.mp3 b/tests/data/ljspeech/wavs/LJ001-0017.mp3 new file mode 100644 index 0000000000..ecc9b2a3eb Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.flac b/tests/data/ljspeech/wavs/LJ001-0018.flac new file mode 100644 index 0000000000..ec038cac88 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.mp3 b/tests/data/ljspeech/wavs/LJ001-0018.mp3 new file mode 100644 index 0000000000..33aa8ba163 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0019.flac b/tests/data/ljspeech/wavs/LJ001-0019.flac new file mode 100644 index 0000000000..6245cc5a07 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0019.mp3 b/tests/data/ljspeech/wavs/LJ001-0019.mp3 new file mode 100644 index 0000000000..e1844dce8b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0020.flac b/tests/data/ljspeech/wavs/LJ001-0020.flac new file mode 100644 index 0000000000..41598a10f1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.flac differ diff --git 
a/tests/data/ljspeech/wavs/LJ001-0020.mp3 b/tests/data/ljspeech/wavs/LJ001-0020.mp3 new file mode 100644 index 0000000000..7a61c05082 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.flac b/tests/data/ljspeech/wavs/LJ001-0021.flac new file mode 100644 index 0000000000..3ec0eeb340 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.mp3 b/tests/data/ljspeech/wavs/LJ001-0021.mp3 new file mode 100644 index 0000000000..45a6d4ce10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.flac b/tests/data/ljspeech/wavs/LJ001-0022.flac new file mode 100644 index 0000000000..9db1c6cf36 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.mp3 b/tests/data/ljspeech/wavs/LJ001-0022.mp3 new file mode 100644 index 0000000000..a0464aa254 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0023.flac b/tests/data/ljspeech/wavs/LJ001-0023.flac new file mode 100644 index 0000000000..621ba660f2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0023.mp3 b/tests/data/ljspeech/wavs/LJ001-0023.mp3 new file mode 100644 index 0000000000..a6b087f8d4 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.flac b/tests/data/ljspeech/wavs/LJ001-0024.flac new file mode 100644 index 0000000000..4125d10bdd Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.mp3 b/tests/data/ljspeech/wavs/LJ001-0024.mp3 new file mode 100644 index 0000000000..0fee298fc6 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.mp3 differ diff 
--git a/tests/data/ljspeech/wavs/LJ001-0025.flac b/tests/data/ljspeech/wavs/LJ001-0025.flac new file mode 100644 index 0000000000..ee0c4b6e05 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0025.mp3 b/tests/data/ljspeech/wavs/LJ001-0025.mp3 new file mode 100644 index 0000000000..f8c13a10be Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.flac b/tests/data/ljspeech/wavs/LJ001-0026.flac new file mode 100644 index 0000000000..119f26fb5e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.mp3 b/tests/data/ljspeech/wavs/LJ001-0026.mp3 new file mode 100644 index 0000000000..fed88cc961 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.flac b/tests/data/ljspeech/wavs/LJ001-0027.flac new file mode 100644 index 0000000000..ff685ca577 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.mp3 b/tests/data/ljspeech/wavs/LJ001-0027.mp3 new file mode 100644 index 0000000000..bc23ed3199 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.flac b/tests/data/ljspeech/wavs/LJ001-0028.flac new file mode 100644 index 0000000000..151334f660 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.mp3 b/tests/data/ljspeech/wavs/LJ001-0028.mp3 new file mode 100644 index 0000000000..0212403392 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0029.flac b/tests/data/ljspeech/wavs/LJ001-0029.flac new file mode 100644 index 0000000000..65586b6c0a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.flac differ 
diff --git a/tests/data/ljspeech/wavs/LJ001-0029.mp3 b/tests/data/ljspeech/wavs/LJ001-0029.mp3 new file mode 100644 index 0000000000..f20eb0dfd2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.flac b/tests/data/ljspeech/wavs/LJ001-0030.flac new file mode 100644 index 0000000000..411553c121 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.mp3 b/tests/data/ljspeech/wavs/LJ001-0030.mp3 new file mode 100644 index 0000000000..7d46fbef9a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.flac b/tests/data/ljspeech/wavs/LJ001-0031.flac new file mode 100644 index 0000000000..b9f4fa683b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.mp3 b/tests/data/ljspeech/wavs/LJ001-0031.mp3 new file mode 100644 index 0000000000..6842943c27 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.flac b/tests/data/ljspeech/wavs/LJ001-0032.flac new file mode 100644 index 0000000000..9166a9d5d5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0032.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.mp3 b/tests/data/ljspeech/wavs/LJ001-0032.mp3 new file mode 100644 index 0000000000..cf5abb648b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0032.mp3 differ diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index cbd98fc0c5..252b429a16 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -8,7 +8,8 @@ from tests import get_tests_data_path, get_tests_output_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig -from TTS.tts.datasets import TTSDataset, load_tts_samples +from TTS.tts.datasets import 
load_tts_samples +from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -21,15 +22,30 @@ c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 c.data_path = os.path.join(get_tests_data_path(), "ljspeech/") -ok_ljspeech = os.path.exists(c.data_path) -dataset_config = BaseDatasetConfig( - formatter="ljspeech_test", # ljspeech_test to multi-speaker - meta_file_train="metadata.csv", +dataset_config_wav = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_wav.csv", meta_file_val=None, path=c.data_path, language="en", ) +dataset_config_mp3 = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_mp3.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) +dataset_config_flac = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_flac.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) + +dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] DATA_EXIST = True if not os.path.exists(c.data_path): @@ -44,11 +60,10 @@ def __init__(self, *args, **kwargs): self.max_loader_iter = 4 self.ap = AudioProcessor(**c.audio) - def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): + def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): # load dataset meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, @@ -64,6 +79,11 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): max_audio_len=c.max_audio_len, start_by_longest=start_by_longest, ) + + # add 
preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + dataloader = DataLoader( dataset, batch_size=batch_size, @@ -75,9 +95,8 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): return dataloader, dataset def test_loader(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(1, 1, 0) - + for dataset_config in dataset_configs: + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) for i, data in enumerate(dataloader): if i == self.max_loader_iter: break @@ -104,8 +123,6 @@ def test_loader(self): # make sure that the computed mels and the waveform match and correctly computed mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # remove padding in mel-spectrogram - mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]] # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding mel_new = mel_new[:, : mel_lengths[0]] ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) @@ -124,40 +141,38 @@ def test_loader(self): self.assertGreaterEqual(mel_input.min(), 0) def test_batch_group_shuffle(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(2, c.r, 16) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) + dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + 
avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + self.assertGreaterEqual(avg_length, last_length) + self.assertTrue(is_items_reordered) def test_start_by_longest(self): """Test start_by_longest option. Ther first item of the fist batch must be longer than all the other items. """ - if ok_ljspeech: - dataloader, _ = self._create_dataloader(2, c.r, 0, True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) + dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + self.assertTrue(all(max_len >= mel_lengths)) def test_padding_and_spectrograms(self): def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): @@ -172,71 +187,70 @@ def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - if ok_ljspeech: - dataloader, _ = self._create_dataloader(1, 1, 0) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = 
torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. - self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + 
mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) + mel = self.ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = self.ap.inv_melspectrogram(mel_spec.T) + self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = self.ap.inv_spectrogram(linear_spec.T) + self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + self.assertEqual(linear_input[1 - idx, -1].sum(), 0) + self.assertEqual(mel_input[1 - idx, -1].sum(), 0) + self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) + self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), 
stop_target.shape[1] - mel_lengths[1]) + self.assertEqual(len(mel_lengths.shape), 1) - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inputs/common_voice.tsv b/tests/inputs/common_voice.tsv index 39fc4190ac..b4351d6739 100644 --- a/tests/inputs/common_voice.tsv +++ b/tests/inputs/common_voice.tsv @@ -1,6 +1,6 @@ client_id path sentence up_votes down_votes age gender accent locale segment -95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en -95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en -95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en -954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en -954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 
2 0 en +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en +954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en +954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 
2 0 en diff --git a/tests/inputs/dummy_model_config.json b/tests/inputs/dummy_model_config.json index b51bb3a871..3f64c7f3df 100644 --- a/tests/inputs/dummy_model_config.json +++ b/tests/inputs/dummy_model_config.json @@ -98,5 +98,3 @@ "gst_style_tokens": 10 } } - - diff --git a/tests/inputs/language_ids.json b/tests/inputs/language_ids.json index 27bb15206f..80833d8058 100644 --- a/tests/inputs/language_ids.json +++ b/tests/inputs/language_ids.json @@ -2,4 +2,4 @@ "en": 0, "fr-fr": 1, "pt-br": 2 -} \ No newline at end of file +} diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts.json index 3f928c7e92..80721346d5 100644 --- a/tests/inputs/test_align_tts.json +++ b/tests/inputs/test_align_tts.json @@ -155,4 +155,4 @@ "meta_file_attn_mask": null } ] -} \ No newline at end of file +} diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index bfcc17ab0e..ae125f1327 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -58,4 +58,4 @@ "storage_size": 15 // the size of the in-memory storage with respect to a single batch }, "datasets":null -} \ No newline at end of file +} diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index 4a7eea5ded..93e4790ca3 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -152,4 +152,4 @@ "meta_file_attn_mask": "tests/data/ljspeech/metadata_attn_mask.txt" } ] -} \ No newline at end of file +} diff --git a/tests/inputs/test_vocoder_audio_config.json b/tests/inputs/test_vocoder_audio_config.json index 08acc48cd3..cdf347c4eb 100644 --- a/tests/inputs/test_vocoder_audio_config.json +++ b/tests/inputs/test_vocoder_audio_config.json @@ -21,4 +21,3 @@ "do_trim_silence": false } } - diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 
82afc97727..2b6cc9e4cd 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -163,4 +163,3 @@ // PATHS "output_path": "tests/train_outputs/" } - diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index 6378c07a6d..bb06bf2448 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -113,4 +113,3 @@ // PATHS "output_path": "tests/train_outputs/" } - diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index ee4e5f8e42..1dd8a229f2 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -109,4 +109,3 @@ // PATHS "output_path": "tests/train_outputs/" } - diff --git a/tests/inputs/xtts_vocab.json b/tests/inputs/xtts_vocab.json index a3c6dcec77..e25b4e4863 100644 --- a/tests/inputs/xtts_vocab.json +++ b/tests/inputs/xtts_vocab.json @@ -12666,4 +12666,4 @@ "da kara" ] } -} \ No newline at end of file +} diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index 8810554421..f9067530e6 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -116,6 +116,12 @@ def setUp(self): output = self.phonemizer.phonemize(text, separator="") self.assertEqual(output, gt) + # UTF8 characters + text = "Åērebię" + gt = "ʑrˈɛbjɛ" + output = ESpeak("pl").phonemize(text, separator="") + self.assertEqual(output, gt) + def test_name(self): self.assertEqual(self.phonemizer.name(), "espeak") @@ -234,8 +240,12 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ 
āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" - self._EXPECTED = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" + self._TEST_CASES = ( + "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" + ) + self._EXPECTED = ( + "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" + ) def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index fcfa71e77d..bf0c8d5d8a 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners +from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners def test_time() -> None: @@ -19,3 +19,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + + +def test_multilingual_phoneme_cleaners() -> None: + assert multilingual_phoneme_cleaners("(Hello)") == "Hello" + assert multilingual_phoneme_cleaners("1:") == "1," diff --git 
a/tests/tts_tests/test_helpers.py b/tests/tts_tests/test_helpers.py index 23bb440a0a..d07efa3620 100644 --- a/tests/tts_tests/test_helpers.py +++ b/tests/tts_tests/test_helpers.py @@ -3,7 +3,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask -def average_over_durations_test(): # pylint: disable=no-self-use +def test_average_over_durations(): # pylint: disable=no-self-use pitch = T.rand(1, 1, 128) durations = T.randint(1, 5, (1, 21)) @@ -21,7 +21,7 @@ def average_over_durations_test(): # pylint: disable=no-self-use index += dur -def seqeunce_mask_test(): +def test_sequence_mask(): lengths = T.randint(10, 15, (8,)) mask = sequence_mask(lengths) for i in range(8): @@ -30,8 +30,8 @@ def seqeunce_mask_test(): assert mask[i, l:].sum() == 0 -def segment_test(): - x = T.range(0, 11) +def test_segment(): + x = T.arange(0, 12) x = x.repeat(8, 1).unsqueeze(1) segment_ids = T.randint(0, 7, (8,)) @@ -50,11 +50,11 @@ def segment_test(): assert x[idx, :, start_indx : start_indx + 10].sum() == segments[idx, :, :].sum() -def rand_segments_test(): +def test_rand_segments(): x = T.rand(2, 3, 4) x_lens = T.randint(3, 4, (2,)) - segments, seg_idxs = rand_segments(x, x_lens, segment_size=3) - assert segments.shape == (2, 3, 3) + segments, seg_idxs = rand_segments(x, x_lens, segment_size=2) + assert segments.shape == (2, 3, 2) assert all(seg_idxs >= 0), seg_idxs try: segments, _ = rand_segments(x, x_lens, segment_size=5) @@ -68,10 +68,10 @@ def rand_segments_test(): assert all(x_lens_back == x_lens) -def generate_path_test(): +def test_generate_path(): durations = T.randint(1, 4, (10, 21)) x_length = T.randint(18, 22, (10,)) - x_mask = sequence_mask(x_length).unsqueeze(1).long() + x_mask = sequence_mask(x_length, max_len=21).unsqueeze(1).long() durations = durations * x_mask.squeeze(1) y_length = durations.sum(1) y_mask = sequence_mask(y_length).unsqueeze(1).long() diff --git a/tests/tts_tests/test_losses.py 
b/tests/tts_tests/test_losses.py index 522b7bb17c..794478dca3 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -216,7 +216,7 @@ def test_in_out(self): # pylint: disable=no-self-use late_x = -200.0 * sequence_mask(length + 1, 100).float() + 100.0 # simulate logits on late stopping loss = layer(true_x, target, length) - self.assertEqual(loss.item(), 0.0) + self.assertAlmostEqual(loss.item(), 0.0) loss = layer(early_x, target, length) self.assertAlmostEqual(loss.item(), 2.1053, places=4) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index b1bdeb9fd1..72b6bcd46b 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -278,7 +278,7 @@ def test_train_step(): }, ) - batch = dict({}) + batch = {} batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device) batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device) batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0] diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 906ec3d09f..7ec3f0df1b 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -4,6 +4,7 @@ import torch from torch import nn, optim +from trainer.generic_utils import count_parameters from tests import get_tests_input_path from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig @@ -24,11 +25,6 @@ WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): @@ -266,7 +262,7 @@ def test_train_step(): }, ) - batch = dict({}) + batch = {} batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device) batch["text_lengths"] = 
torch.randint(100, 129, (8,)).long().to(device) batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0] diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index fca9955619..17992773ad 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -64,7 +64,6 @@ def test_load_audio(self): def test_dataset(self): """TODO:""" - ... def test_init_multispeaker(self): num_speakers = 10 @@ -213,7 +212,7 @@ def test_d_vector_forward(self): d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) config = VitsConfig(model_args=args) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) model.train() input_dummy, input_lengths, _, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size) d_vectors = torch.randn(batch_size, 256).to(device) @@ -358,7 +357,7 @@ def test_d_vector_inference(self): d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) config = VitsConfig(model_args=args) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) model.eval() # batch size = 1 input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) @@ -512,7 +511,7 @@ def test_train_step_upsampling_interpolation(self): def test_train_eval_log(self): batch_size = 2 config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) model.run_data_dep_init = False model.train() batch = self._create_batch(config, batch_size) @@ -531,7 +530,7 @@ def test_train_eval_log(self): def test_test_run(self): config = VitsConfig(model_args=VitsArgs(num_chars=32)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) model.run_data_dep_init = False model.eval() test_figures, 
test_audios = model.test_run(None) @@ -541,7 +540,7 @@ def test_test_run(self): def test_load_checkpoint(self): chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") config = VitsConfig(VitsArgs(num_chars=32)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) chkp = {} chkp["model"] = model.state_dict() torch.save(chkp, chkp_path) @@ -552,20 +551,20 @@ def test_load_checkpoint(self): def test_get_criterion(self): config = VitsConfig(VitsArgs(num_chars=32)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) criterion = model.get_criterion() self.assertTrue(criterion is not None) def test_init_from_config(self): config = VitsConfig(model_args=VitsArgs(num_chars=32)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) self.assertTrue(not hasattr(model, "emb_g")) config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2, use_speaker_embedding=True)) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) self.assertEqual(model.num_speakers, 2) self.assertTrue(hasattr(model, "emb_g")) @@ -577,7 +576,7 @@ def test_init_from_config(self): speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), ) ) - model = Vits.init_from_config(config, verbose=False).to(device) + model = Vits.init_from_config(config).to(device) self.assertEqual(model.num_speakers, 10) self.assertTrue(hasattr(model, "emb_g")) @@ -589,7 +588,7 @@ def test_init_from_config(self): d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")], ) ) - model = Vits.init_from_config(config, 
verbose=False).to(device) + model = Vits.init_from_config(config).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) self.assertTrue(model.embedded_speaker_dim == config.d_vector_dim) diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 2a723f105f..3c7ac51556 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -4,6 +4,7 @@ import torch from torch import optim +from trainer.generic_utils import count_parameters from trainer.logging.tensorboard_logger import TensorboardLogger from tests import get_tests_data_path, get_tests_input_path, get_tests_output_path @@ -26,11 +27,6 @@ BATCH_SIZE = 3 -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TestGlowTTS(unittest.TestCase): @staticmethod def _create_inputs(batch_size=8): @@ -136,7 +132,7 @@ def _test_forward_with_d_vector(self, batch_size): d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) # inference encoder and decoder with MAS @@ -162,7 +158,7 @@ def _test_forward_with_speaker_id(self, batch_size): use_speaker_embedding=True, num_speakers=24, ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) # inference encoder and decoder with MAS @@ -210,7 +206,7 @@ def _test_inference_with_d_vector(self, batch_size): d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + 
model = GlowTTS.init_from_config(config).to(device) model.eval() outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "d_vectors": d_vector}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) @@ -228,7 +224,7 @@ def _test_inference_with_speaker_ids(self, batch_size): use_speaker_embedding=True, num_speakers=24, ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) outputs = model.inference(input_dummy, {"x_lengths": input_lengths, "speaker_ids": speaker_ids}) self._assert_inference_outputs(outputs, input_dummy, mel_spec) @@ -303,7 +299,7 @@ def test_train_eval_log(self): batch["d_vectors"] = None batch["speaker_ids"] = None config = GlowTTSConfig(num_chars=32) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) model.run_data_dep_init = False model.train() logger = TensorboardLogger( @@ -317,7 +313,7 @@ def test_train_eval_log(self): def test_test_run(self): config = GlowTTSConfig(num_chars=32) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) model.run_data_dep_init = False model.eval() test_figures, test_audios = model.test_run(None) @@ -327,7 +323,7 @@ def test_test_run(self): def test_load_checkpoint(self): chkp_path = os.path.join(get_tests_output_path(), "dummy_glow_tts_checkpoint.pth") config = GlowTTSConfig(num_chars=32) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) chkp = {} chkp["model"] = model.state_dict() torch.save(chkp, chkp_path) @@ -338,21 +334,21 @@ def test_load_checkpoint(self): def test_get_criterion(self): config = GlowTTSConfig(num_chars=32) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) criterion = model.get_criterion() self.assertTrue(criterion is not 
None) def test_init_from_config(self): config = GlowTTSConfig(num_chars=32) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) config = GlowTTSConfig(num_chars=32, num_speakers=2) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) self.assertTrue(model.num_speakers == 2) self.assertTrue(not hasattr(model, "emb_g")) config = GlowTTSConfig(num_chars=32, num_speakers=2, use_speaker_embedding=True) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) self.assertTrue(model.num_speakers == 2) self.assertTrue(hasattr(model, "emb_g")) @@ -362,7 +358,7 @@ def test_init_from_config(self): use_speaker_embedding=True, speakers_file=os.path.join(get_tests_data_path(), "ljspeech", "speakers.json"), ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) self.assertTrue(model.num_speakers == 10) self.assertTrue(hasattr(model, "emb_g")) @@ -372,7 +368,7 @@ def test_init_from_config(self): d_vector_dim=256, d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"), ) - model = GlowTTS.init_from_config(config, verbose=False).to(device) + model = GlowTTS.init_from_config(config).to(device) self.assertTrue(model.num_speakers == 1) self.assertTrue(not hasattr(model, "emb_g")) self.assertTrue(model.c_in_channels == config.d_vector_dim) diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index a4a4f72679..c90551b494 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -2,10 +2,10 @@ import unittest import torch +from trainer.generic_utils import count_parameters from tests import get_tests_input_path -from TTS.vc.configs.freevc_config import FreeVCConfig -from TTS.vc.models.freevc import FreeVC +from TTS.vc.models.freevc import FreeVC, 
FreeVCConfig # pylint: disable=unused-variable # pylint: disable=no-self-use @@ -20,11 +20,6 @@ BATCH_SIZE = 3 -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TestFreeVC(unittest.TestCase): def _create_inputs(self, config, batch_size=2): input_dummy = torch.rand(batch_size, 30 * config.audio["hop_length"]).to(device) @@ -116,20 +111,14 @@ def test_voice_conversion(self): output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0] ), f"{output_wav.shape} != {source_wav.shape}" - def test_train_step(self): - ... + def test_train_step(self): ... - def test_train_eval_log(self): - ... + def test_train_eval_log(self): ... - def test_test_run(self): - ... + def test_test_run(self): ... - def test_load_checkpoint(self): - ... + def test_load_checkpoint(self): ... - def test_get_criterion(self): - ... + def test_get_criterion(self): ... - def test_init_from_config(self): - ... + def test_init_from_config(self): ... 
diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index fe56ee783f..9b10759505 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -1,43 +1,54 @@ import glob import os import shutil +import unittest from tests import get_device_id, get_tests_output_path, run_cli from TTS.vocoder.configs import WavegradConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + +class WavegradTrainingTest(unittest.TestCase): + # TODO: Reactivate after improving CI run times + # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) + if os.getenv("GITHUB_ACTIONS") == "true": + __test__ = False + + def test_train(self): # pylint: disable=no-self-use + config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") + output_path = 
os.path.join(get_tests_output_path(), "train_outputs") + + config = WavegradConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 8fa56e287a..b944423988 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -4,11 +4,11 @@ import shutil import torch +from trainer.io import get_user_data_dir from tests import get_tests_data_path, get_tests_output_path, run_cli from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.manage import ModelManager MODELS_WITH_SEP_TESTS = [ @@ -50,13 +50,13 @@ def run_models(offset=0, step=1): speaker_id = list(speaker_manager.name_to_id.keys())[0] run_cli( f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --progress_bar False' + f'--text "This is an example." 
--out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --no-progress_bar' ) else: # single-speaker model run_cli( f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' ) # remove downloaded models shutil.rmtree(local_download_dir) @@ -66,7 +66,7 @@ def run_models(offset=0, step=1): reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") run_cli( f"tts --model_name {model_name} " - f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --progress_bar False' + f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar' ) else: # only download the model @@ -83,14 +83,14 @@ def test_xtts(): run_cli( "yes | " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) else: run_cli( "yes | " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False ' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) @@ -138,14 +138,14 @@ def test_xtts_v2(): run_cli( "yes | " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' + f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar --use_cuda ' f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' ) else: run_cli( "yes | " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False ' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar ' f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' ) @@ -215,12 +215,12 @@ def test_tortoise(): if use_gpu: run_cli( f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' ) else: run_cli( f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' ) @@ -231,12 +231,12 @@ def test_bark(): if use_gpu: run_cli( f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True' + f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' ) else: run_cli( f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --progress_bar False' + f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar' ) @@ -249,7 +249,7 @@ def test_voice_conversion(): output_path = os.path.join(get_tests_output_path(), "output.wav") run_cli( f"tts --model_name {model_name}" - f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --progress_bar False" + f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar" )