From 75314de10556a6fa129ac1598e34fdcbbab74fe4 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 26 Jun 2024 03:58:26 +0900 Subject: [PATCH] =?UTF-8?q?=E6=95=B4=E7=90=86:=20=E5=90=88=E6=88=90?= =?UTF-8?q?=E7=B3=BB=E3=81=AE=E3=82=B3=E3=82=A2=E7=9B=B4=E6=8E=A5=E5=91=BC?= =?UTF-8?q?=E3=81=B3=E5=87=BA=E3=81=97=E3=82=92=20`TTSEngine`=20=E3=81=B8?= =?UTF-8?q?=E7=A7=BB=E5=8B=95=20(#1420)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: `default_sampling_rate` を TTSEngine へ移動 * refactor: 初期化をリネーム * refactor: 初期化に関する docstring を明確化 * refactor: `.supported_devices` を `TTSEngine` へ移動 * refactor: `.get_core()` エラーのチェックに用いる API を変更 * fix: `default_sampling_rate` の移動に追従 * fix: lint * refactor: サンプリングレートに関する docstring を追加 --- test/e2e/test_missing_core.py | 2 +- voicevox_engine/app/application.py | 4 +++- voicevox_engine/app/routers/engine_info.py | 7 +++++-- voicevox_engine/app/routers/morphing.py | 1 - voicevox_engine/app/routers/tts_pipeline.py | 17 +++++++---------- voicevox_engine/morphing/morphing.py | 4 +--- voicevox_engine/tts_pipeline/tts_engine.py | 21 +++++++++++++++++++-- 7 files changed, 36 insertions(+), 20 deletions(-) diff --git a/test/e2e/test_missing_core.py b/test/e2e/test_missing_core.py index 39e241fa2..2365cecd4 100644 --- a/test/e2e/test_missing_core.py +++ b/test/e2e/test_missing_core.py @@ -6,6 +6,6 @@ def test_missing_core_422(client: TestClient, snapshot_json: SnapshotAssertion) -> None: """存在しないコアを指定するとエラーを返す。""" - response = client.get("/supported_devices", params={"core_version": "4.0.4"}) + response = client.get("/speakers", params={"core_version": "4.0.4"}) assert response.status_code == 422 assert snapshot_json == response.json() diff --git a/voicevox_engine/app/application.py b/voicevox_engine/app/application.py index 3862838ed..862b615e4 100644 --- a/voicevox_engine/app/application.py +++ b/voicevox_engine/app/application.py @@ -85,7 +85,9 @@ def generate_app( 
generate_library_router(library_manager, verify_mutability_allowed) ) app.include_router(generate_user_dict_router(user_dict, verify_mutability_allowed)) - app.include_router(generate_engine_info_router(core_manager, engine_manifest)) + app.include_router( + generate_engine_info_router(core_manager, tts_engines, engine_manifest) + ) app.include_router( generate_setting_router( setting_loader, engine_manifest.brand_name, verify_mutability_allowed diff --git a/voicevox_engine/app/routers/engine_info.py b/voicevox_engine/app/routers/engine_info.py index 96f22b4d0..5d4ca5b44 100644 --- a/voicevox_engine/app/routers/engine_info.py +++ b/voicevox_engine/app/routers/engine_info.py @@ -10,6 +10,7 @@ from voicevox_engine.core.core_adapter import DeviceSupport from voicevox_engine.core.core_initializer import CoreManager from voicevox_engine.engine_manifest import EngineManifest +from voicevox_engine.tts_pipeline.tts_engine import TTSEngineManager class SupportedDevicesInfo(BaseModel): @@ -32,7 +33,9 @@ def generate_from(cls, device_support: DeviceSupport) -> Self: def generate_engine_info_router( - core_manager: CoreManager, engine_manifest_data: EngineManifest + core_manager: CoreManager, + tts_engine_manager: TTSEngineManager, + engine_manifest_data: EngineManifest, ) -> APIRouter: """エンジン情報 API Router を生成する""" router = APIRouter(tags=["その他"]) @@ -53,7 +56,7 @@ def supported_devices( ) -> SupportedDevicesInfo: """対応デバイスの一覧を取得します。""" version = core_version or core_manager.latest_version() - supported_devices = core_manager.get_core(version).supported_devices + supported_devices = tts_engine_manager.get_engine(version).supported_devices if supported_devices is None: raise HTTPException(status_code=422, detail="非対応の機能です。") return SupportedDevicesInfo.generate_from(supported_devices) diff --git a/voicevox_engine/app/routers/morphing.py b/voicevox_engine/app/routers/morphing.py index b4fcd734a..1c6ff7d9d 100644 --- a/voicevox_engine/app/routers/morphing.py +++ 
b/voicevox_engine/app/routers/morphing.py @@ -112,7 +112,6 @@ def _synthesis_morphing( # 生成したパラメータはキャッシュされる morph_param = synthesis_morphing_parameter( engine=engine, - core=core, query=query, base_style_id=base_style_id, target_style_id=target_style_id, diff --git a/voicevox_engine/app/routers/tts_pipeline.py b/voicevox_engine/app/routers/tts_pipeline.py index 4a2159a09..b9b67617e 100644 --- a/voicevox_engine/app/routers/tts_pipeline.py +++ b/voicevox_engine/app/routers/tts_pipeline.py @@ -87,7 +87,6 @@ def audio_query( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) accent_phrases = engine.create_accent_phrases(text, style_id) return AudioQuery( accent_phrases=accent_phrases, @@ -99,7 +98,7 @@ def audio_query( postPhonemeLength=0.1, pauseLength=None, pauseLengthScale=1, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, kana=create_kana(accent_phrases), ) @@ -119,7 +118,6 @@ def audio_query_from_preset( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) try: presets = preset_manager.load_presets() except PresetInputError as err: @@ -146,7 +144,7 @@ def audio_query_from_preset( postPhonemeLength=selected_preset.postPhonemeLength, pauseLength=selected_preset.pauseLength, pauseLengthScale=selected_preset.pauseLengthScale, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, kana=create_kana(accent_phrases), ) @@ -378,7 +376,6 @@ def sing_frame_audio_query( """ version = core_version or core_manager.latest_version() engine = tts_engines.get_engine(version) - core = core_manager.get_core(version) try: phonemes, f0, volume = engine.create_sing_phoneme_and_f0_and_volume( score, style_id @@ -391,7 +388,7 @@ def sing_frame_audio_query( volume=volume, 
phonemes=phonemes, volumeScale=1, - outputSamplingRate=core.default_sampling_rate, + outputSamplingRate=engine.default_sampling_rate, outputStereo=False, ) @@ -532,8 +529,8 @@ def initialize_speaker( 実行しなくても他のAPIは使用できますが、初回実行時に時間がかかることがあります。 """ version = core_version or core_manager.latest_version() - core = core_manager.get_core(version) - core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit) + engine = tts_engines.get_engine(version) + engine.initialize_synthesis(style_id, skip_reinit=skip_reinit) @router.get("/is_initialized_speaker", tags=["その他"]) def is_initialized_speaker( @@ -544,7 +541,7 @@ def is_initialized_speaker( 指定されたスタイルが初期化されているかどうかを返します。 """ version = core_version or core_manager.latest_version() - core = core_manager.get_core(version) - return core.is_initialized_style_id_synthesis(style_id) + engine = tts_engines.get_engine(version) + return engine.is_synthesis_initialized(style_id) return router diff --git a/voicevox_engine/morphing/morphing.py b/voicevox_engine/morphing/morphing.py index b426b26e0..d58742ea1 100644 --- a/voicevox_engine/morphing/morphing.py +++ b/voicevox_engine/morphing/morphing.py @@ -14,7 +14,6 @@ from voicevox_engine.morphing.model import MorphableTargetInfo -from ..core.core_adapter import CoreAdapter from ..metas.Metas import Speaker, StyleId from ..model import AudioQuery from ..tts_pipeline.tts_engine import TTSEngine @@ -98,7 +97,6 @@ def is_morphable( def synthesis_morphing_parameter( engine: TTSEngine, - core: CoreAdapter, query: AudioQuery, base_style_id: StyleId, target_style_id: StyleId, @@ -106,7 +104,7 @@ def synthesis_morphing_parameter( query = deepcopy(query) # 不具合回避のためデフォルトのサンプリングレートでWORLDに掛けた後に指定のサンプリングレートに変換する - query.outputSamplingRate = core.default_sampling_rate + query.outputSamplingRate = engine.default_sampling_rate # WORLDに掛けるため合成はモノラルで行う query.outputStereo = False diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 
9d3c57248..373d65c11 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -8,7 +8,7 @@ from numpy.typing import NDArray from soxr import resample -from ..core.core_adapter import CoreAdapter +from ..core.core_adapter import CoreAdapter, DeviceSupport from ..core.core_initializer import CoreManager from ..core.core_wrapper import CoreWrapper from ..metas.Metas import StyleId @@ -438,7 +438,16 @@ class TTSEngine: def __init__(self, core: CoreWrapper): super().__init__() self._core = CoreAdapter(core) - # NOTE: self._coreは将来的に消す予定 + + @property + def default_sampling_rate(self) -> int: + """Get the default sampling rate of the synthesized audio waveform.""" + return self._core.default_sampling_rate + + @property + def supported_devices(self) -> DeviceSupport | None: + """Get the listing of whether each device is available for synthesis.""" + return self._core.supported_devices def update_length( self, accent_phrases: list[AccentPhrase], style_id: StyleId @@ -574,6 +583,14 @@ def synthesize_wave( wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) return wave + def initialize_synthesis(self, style_id: StyleId, skip_reinit: bool) -> None: + """Initialize synthesis for the given style ID; if it is already initialized, re-initialize according to the argument.""" + self._core.initialize_style_id_synthesis(style_id, skip_reinit=skip_reinit) + + def is_synthesis_initialized(self, style_id: StyleId) -> bool: + """Return whether synthesis for the given style ID has been initialized.""" + return self._core.is_initialized_style_id_synthesis(style_id) + # FIXME: sing用のエンジンに移すかクラス名変える # 返す値の総称を考え、関数名を変更する def create_sing_phoneme_and_f0_and_volume(