From c33e40e1bb15abbe873627b8b2771e39583b3bda Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Sun, 23 Jun 2024 23:09:22 -0700 Subject: [PATCH] 3.1.1 (#821) --- docs/source/changelog/changelog_3.0.rst | 9 +++ docs/source/user_guide/concepts/hmm.md | 35 +++++++- montreal_forced_aligner/abc.py | 2 +- .../acoustic_modeling/monophone.py | 7 +- .../acoustic_modeling/triphone.py | 17 +++- montreal_forced_aligner/alignment/base.py | 1 + montreal_forced_aligner/command_line/align.py | 6 +- .../command_line/validate.py | 2 +- .../corpus/acoustic_corpus.py | 12 ++- montreal_forced_aligner/corpus/helper.py | 2 + .../corpus/multiprocessing.py | 9 ++- montreal_forced_aligner/corpus/text_corpus.py | 13 +-- montreal_forced_aligner/dictionary/mixins.py | 9 +-- .../dictionary/multispeaker.py | 2 +- montreal_forced_aligner/helper.py | 79 +++++++++++++++++-- montreal_forced_aligner/models.py | 58 +++++++------- .../tokenization/simple.py | 8 +- .../vad/multiprocessing.py | 3 +- tests/test_helper.py | 7 +- 19 files changed, 207 insertions(+), 74 deletions(-) diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst index 8025ac35..d6428a76 100644 --- a/docs/source/changelog/changelog_3.0.rst +++ b/docs/source/changelog/changelog_3.0.rst @@ -5,6 +5,15 @@ 3.0 Changelog ************* +3.1.1 +----- + +- Fixed a bug where hidden files and folders would be parsed as corpus data +- Fixed a bug where validation would not respect :code:`--no_final_clean` +- Fixed a rare crash in training when a job would not have utterances assigned to it +- Fixed a bug where MFA would mistakenly report a dictionary and acoustic model phones did not match for older versions + + 3.1.0 ----- diff --git a/docs/source/user_guide/concepts/hmm.md b/docs/source/user_guide/concepts/hmm.md index 2da93859..0bc07cba 100644 --- a/docs/source/user_guide/concepts/hmm.md +++ b/docs/source/user_guide/concepts/hmm.md @@ -22,4 +22,37 @@ Still under construction, I hope to fill these 
sections out as I have time.
 
 ### MFA topology
 
-MFA uses a variable 5-state topology for modeling phones. Each state has a likelihood to transition to the final state in addition to the next state. What this is means is that each phone has a minimum duration of 10ms (corresponding to the default time step for MFCC generation), rather than 30ms for a more standard 3-state HMM. Having a shorter minimum duration reduces alignment errors from short or dropped phones, i.e., American English flaps or schwas, or accommodate for dictionary errors (though these should still be fixed).
+MFA uses a variable 3-state topology for modeling phones. Each state has a likelihood to transition to the final state in addition to the next state. What this means is that each phone has a minimum duration of 10ms (corresponding to the default time step for MFCC generation), rather than 30ms for a more standard 3-state HMM. Having a shorter minimum duration reduces alignment errors from short or dropped phones, i.e., American English flaps or schwas, and accommodates dictionary errors (though these should still be fixed).
+
+#### Customizing topologies
+
+Custom numbers of states can be specified via a topology configuration file. The configuration file should list per-phone minimum and maximum states, as below.
+
+```{code}yaml
+tʃ:
+ - min_states: 3
+ - max_states: 5
+ɾ:
+ - min_states: 1
+ - max_states: 1
+```
+
+In the above example, the {ipa_inline}`[tʃ]` phone will have a variable topology with a minimum of 3 states before terminating, but optionally up to 5 states to cover additional transitions for the complex articulation. Conversely, the {ipa_inline}`[ɾ]` phone is a very short articulation and so having both minimum and maximum set to 1 state ensures that additional states are not used to model the phone. 
+ +```{seealso} +* [Example configuration files](https://github.com/MontrealCorpusTools/mfa-models/tree/main/config/acoustic/topologies) +``` + +## Clustering phones + +In a monophone model, each phone is modeled the same regardless of the surrounding phonological context. Consider the {ipa_inline}`[P]` in the words {ipa_inline}`paid [P EY1 D]` and {ipa_inline}`spade [S P EY1 D]` in English. The actual pronunciation of the {ipa_inline}`[P]` in paid will be an aspirated {ipa_inline}`[pʰ]` but the pronunciation of {ipa_inline}`[P]` following {ipa_inline}`[S]` is an unaspirated {ipa_inline}`[p]`. + +To more accurately model these phonological variants, we use triphone models. Under the hood, each phone gets transformed into a sequence of three phones, including the phone and its preceding and following phones. So the representation for "paid" and "spade" becomes {ipa_inline}`[#/P/EY1 P/EY1/D EY1/D/#]` and {ipa_inline}`[#/S/P S/P/EY1 P/EY1/D EY1/D/#]`. At this level, we have made it so that the {ipa_inline}`[P]` phones have two different labels, so each can be modeled differently. + +However, representing phones this way results in a massive explosion of the number of phones, with not as many corresponding occurrences. If there is not much data for particular phones, modeling them appropriately becomes challenging. The solution to this data sparsity issue is to cluster the resulting states based on their similarity. For the triphone {ipa_inline}`[P/EY1/D]`, triphones like {ipa_inline}`[P/EY1/T]`, {ipa_inline}`[B/EY1/D]`,{ipa_inline}`[M/EY1/D]`,{ipa_inline}`[B/EY1/T]`, and {ipa_inline}`[M/EY1/T]` will all have similar acoustics, as they're {ipa_inline}`[EY1]` vowels with bilabial stops preceding and oral coronal stops following. The triphone {ipa_inline}`[P/EY1/N]` and others with following nasals will likely not be similar enough due to regressive nasalization in English. 
+ +As a result of the phone clustering, the number of PDFs being modeled is reduced to a more manageable number with less data sparsity issues. + +```{note} +By default Kaldi and earlier versions MFA included silence phones with the nonsilence phones, due to the idea for instance that stops have a closure state to them and so that is similar to silence. However, having silence states be clustered with nonsilence states has led to gross alignment errors with less clean data, so MFA 3.1 and later removes all instances of the silence phone being clustered with nonsilence phones. The OOV phone is still clustered with both silence and nonsilence however, and OOVs can cover multiple words. +``` \ No newline at end of file diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py index 96fcc4df..8f2c4f77 100644 --- a/montreal_forced_aligner/abc.py +++ b/montreal_forced_aligner/abc.py @@ -699,7 +699,7 @@ def cleanup(self) -> None: logger.info(f"Done! Everything took {time.time() - self.start_time:.3f} seconds") if config.FINAL_CLEAN: logger.debug( - "Cleaning up temporary files, use the --debug flag to keep temporary files." + "Cleaning up temporary files, use the --no_final_clean flag to keep temporary files." ) if hasattr(self, "delete_database"): if config.USE_POSTGRES: diff --git a/montreal_forced_aligner/acoustic_modeling/monophone.py b/montreal_forced_aligner/acoustic_modeling/monophone.py index 06d68b72..b5aee09d 100644 --- a/montreal_forced_aligner/acoustic_modeling/monophone.py +++ b/montreal_forced_aligner/acoustic_modeling/monophone.py @@ -133,9 +133,10 @@ def _run(self): writer.Close() self.callback((accumulator.transition_accs, accumulator.gmm_accs)) train_logger.info(f"Done {num_done} utterances, errors on {num_error} utterances.") - train_logger.info( - f"Overall avg like per frame (Gaussian only) = {tot_like/tot_t} over {tot_t} frames." 
- ) + if tot_t: + train_logger.info( + f"Overall avg like per frame (Gaussian only) = {tot_like/tot_t} over {tot_t} frames." + ) class MonophoneTrainer(AcousticModelTrainingMixin): diff --git a/montreal_forced_aligner/acoustic_modeling/triphone.py b/montreal_forced_aligner/acoustic_modeling/triphone.py index fce20bfb..b0d6434f 100644 --- a/montreal_forced_aligner/acoustic_modeling/triphone.py +++ b/montreal_forced_aligner/acoustic_modeling/triphone.py @@ -411,11 +411,22 @@ def _setup_tree(self, init_from_previous=False, initial_mix_up=True) -> None: silence_sets = [ x for x in questions if silence_phone_id in x and x != [silence_phone_id] ] + filtered = [] + existing_sets = {tuple(x) for x in questions} for q_set in silence_sets: train_logger.debug(", ".join([self.reversed_phone_mapping[x] for x in q_set])) - questions = [ - x for x in questions if silence_phone_id not in x or x == [silence_phone_id] - ] + + for q_set in questions: + if silence_phone_id not in q_set or q_set == [silence_phone_id]: + filtered.append(q_set) + continue + q_set = [x for x in q_set if x != silence_phone_id] + if not q_set: + continue + if tuple(q_set) in existing_sets: + continue + filtered.append(q_set) + questions = filtered extra_questions = self.worker.extra_questions_mapping if extra_questions: diff --git a/montreal_forced_aligner/alignment/base.py b/montreal_forced_aligner/alignment/base.py index f060bc3f..0511f775 100644 --- a/montreal_forced_aligner/alignment/base.py +++ b/montreal_forced_aligner/alignment/base.py @@ -1449,6 +1449,7 @@ def evaluate_alignments( align_phones, silence_phone=self.optional_silence_phone, custom_mapping=mapping, + debug=config.DEBUG, ) unaligned_utts = [] utterances: typing.List[Utterance] = session.query(Utterance).options( diff --git a/montreal_forced_aligner/command_line/align.py b/montreal_forced_aligner/command_line/align.py index 2c564ef5..e84e1461 100644 --- a/montreal_forced_aligner/command_line/align.py +++ 
b/montreal_forced_aligner/command_line/align.py @@ -4,7 +4,6 @@ from pathlib import Path import rich_click as click -import yaml from montreal_forced_aligner import config from montreal_forced_aligner.alignment import PretrainedAligner @@ -15,7 +14,7 @@ validate_g2p_model, ) from montreal_forced_aligner.data import WorkflowType -from montreal_forced_aligner.helper import mfa_open +from montreal_forced_aligner.helper import load_evaluation_mapping __all__ = ["align_corpus_cli"] @@ -137,8 +136,7 @@ def align_corpus_cli(context, **kwargs) -> None: aligner.load_reference_alignments(reference_directory) mapping = None if custom_mapping_path: - with mfa_open(custom_mapping_path, "r") as f: - mapping = yaml.load(f, Loader=yaml.Loader) + mapping = load_evaluation_mapping(custom_mapping_path) aligner.validate_mapping(mapping) reference_alignments = WorkflowType.reference else: diff --git a/montreal_forced_aligner/command_line/validate.py b/montreal_forced_aligner/command_line/validate.py index b65fdc4e..690a9444 100644 --- a/montreal_forced_aligner/command_line/validate.py +++ b/montreal_forced_aligner/command_line/validate.py @@ -104,6 +104,7 @@ def validate_corpus_cli(context, **kwargs) -> None: """ if kwargs.get("profile", None) is not None: config.profile = kwargs.pop("profile") + config.FINAL_CLEAN = True config.update_configuration(kwargs) kwargs["USE_THREADING"] = False @@ -139,7 +140,6 @@ def validate_corpus_cli(context, **kwargs) -> None: validator.dirty = True raise finally: - config.FINAL_CLEAN = True validator.cleanup() diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py index 0044d449..a8ec6d9b 100644 --- a/montreal_forced_aligner/corpus/acoustic_corpus.py +++ b/montreal_forced_aligner/corpus/acoustic_corpus.py @@ -234,8 +234,12 @@ def load_reference_alignments(self, reference_directory: Path) -> None: max_id = p_id new_phones = [] for root, _, files in os.walk(reference_directory, 
followlinks=True): + if root.startswith("."): # Ignore hidden directories + continue root_speaker = os.path.basename(root) for f in files: + if f.startswith("."): # Ignore hidden files + continue if f.endswith(".TextGrid"): file_name = f.replace(".TextGrid", "") file_id = session.query(File.id).filter_by(name=file_name).scalar() @@ -988,6 +992,8 @@ def _load_corpus_from_source(self) -> None: for root, _, files in os.walk(self.audio_directory, followlinks=True): if self.stopped.is_set(): return + if root.startswith("."): # Ignore hidden directories + continue exts = find_exts(files) exts.wav_files = {k: os.path.join(root, v) for k, v in exts.wav_files.items()} exts.other_audio_files = { @@ -999,12 +1005,14 @@ def _load_corpus_from_source(self) -> None: with self.session() as session: import_data = DatabaseImportData() for root, _, files in os.walk(self.corpus_directory, followlinks=True): + if self.stopped.is_set(): + return + if root.startswith("."): # Ignore hidden directories + continue exts = find_exts(files) relative_path = ( root.replace(str(self.corpus_directory), "").lstrip("/").lstrip("\\") ) - if self.stopped.is_set(): - return if not use_audio_directory: all_sound_files = {} wav_files = {k: os.path.join(root, v) for k, v in exts.wav_files.items()} diff --git a/montreal_forced_aligner/corpus/helper.py b/montreal_forced_aligner/corpus/helper.py index 7ade00e4..1d819b07 100644 --- a/montreal_forced_aligner/corpus/helper.py +++ b/montreal_forced_aligner/corpus/helper.py @@ -62,6 +62,8 @@ def find_exts(files: typing.List[str]) -> FileExtensions: """ exts = FileExtensions(set(), {}, {}, {}, {}) for full_filename in files: + if full_filename.startswith("."): # Ignore hidden files + continue try: filename, fext = full_filename.rsplit(".", maxsplit=1) except ValueError: diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py index 8905e34c..71049a63 100644 --- 
a/montreal_forced_aligner/corpus/multiprocessing.py +++ b/montreal_forced_aligner/corpus/multiprocessing.py @@ -106,6 +106,8 @@ def run(self) -> None: if self.audio_directory and os.path.exists(self.audio_directory): use_audio_directory = True for root, _, files in os.walk(self.audio_directory, followlinks=True): + if root.startswith("."): # Ignore hidden directories + continue exts = find_exts(files) wav_files = {k: os.path.join(root, v) for k, v in exts.wav_files.items()} other_audio_files = { @@ -114,11 +116,12 @@ def run(self) -> None: all_sound_files.update(other_audio_files) all_sound_files.update(wav_files) for root, _, files in os.walk(self.corpus_directory, followlinks=True): - exts = find_exts(files) - relative_path = root.replace(str(self.corpus_directory), "").lstrip("/").lstrip("\\") - if self.stopped.is_set(): break + if root.startswith("."): # Ignore hidden directories + continue + exts = find_exts(files) + relative_path = root.replace(str(self.corpus_directory), "").lstrip("/").lstrip("\\") if not use_audio_directory: all_sound_files = {} exts.wav_files = {k: os.path.join(root, v) for k, v in exts.wav_files.items()} diff --git a/montreal_forced_aligner/corpus/text_corpus.py b/montreal_forced_aligner/corpus/text_corpus.py index cea265ea..4e2c3654 100644 --- a/montreal_forced_aligner/corpus/text_corpus.py +++ b/montreal_forced_aligner/corpus/text_corpus.py @@ -65,13 +65,14 @@ def _load_corpus_from_source_mp(self) -> None: file_count = 0 with tqdm(total=1, disable=config.QUIET) as pbar, self.session() as session: for root, _, files in os.walk(self.corpus_directory, followlinks=True): + if self.stopped.is_set(): + break + if root.startswith("."): # Ignore hidden directories + continue exts = find_exts(files) relative_path = ( root.replace(str(self.corpus_directory), "").lstrip("/").lstrip("\\") ) - - if self.stopped.is_set(): - break for file_name in exts.identifiers: if self.stopped.is_set(): break @@ -181,12 +182,14 @@ def 
_load_corpus_from_source(self) -> None: sanitize_function = getattr(self, "sanitize_function", None) with self.session() as session: for root, _, files in os.walk(self.corpus_directory, followlinks=True): + if self.stopped: + return + if root.startswith("."): # Ignore hidden directories + continue exts = find_exts(files) relative_path = ( root.replace(str(self.corpus_directory), "").lstrip("/").lstrip("\\") ) - if self.stopped: - return for file_name in exts.identifiers: wav_path = None if file_name in exts.lab_files: diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py index 3d13bc1a..46743e4e 100644 --- a/montreal_forced_aligner/dictionary/mixins.py +++ b/montreal_forced_aligner/dictionary/mixins.py @@ -260,11 +260,7 @@ def get_base_phone(self, phone: str) -> str: @property def extra_questions_mapping(self) -> Dict[str, List[str]]: """Mapping of extra questions for the given phone set type""" - mapping = {"silence_question": []} - for p in sorted(self.silence_phones): - mapping["silence_question"].append(p) - if self.position_dependent_phones: - mapping["silence_question"].extend([p + x for x in self.positions]) + mapping = {} for k, v in self.phone_set_type.extra_questions.items(): if k not in mapping: mapping[k] = [] @@ -427,7 +423,8 @@ def _generate_positional_list(self, phones: Set[str]) -> List[str]: List of positional phones, sorted by base phone """ positional_phones = [] - phones |= {self.get_base_phone(p) for p in phones} + if not hasattr(self, "acoustic_model"): + phones |= {self.get_base_phone(p) for p in phones} for p in sorted(phones): if p not in self.non_silence_phones: continue diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index 23b3b14b..075001c0 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -152,7 +152,7 @@ def tokenizers(self): 
bracketed_word=self.bracketed_word, cutoff_word=self.cutoff_word, ignore_case=self.ignore_case, - use_g2p=self.use_g2p, + use_g2p=self.use_g2p or getattr(self, "g2p_model", None) is not None, clitic_set=clitic_set, grapheme_set=grapheme_set, ) diff --git a/montreal_forced_aligner/helper.py b/montreal_forced_aligner/helper.py index 8f44c4e2..6eae5109 100644 --- a/montreal_forced_aligner/helper.py +++ b/montreal_forced_aligner/helper.py @@ -51,6 +51,7 @@ "load_configuration", "format_correction", "format_probability", + "load_evaluation_mapping", ] @@ -160,7 +161,6 @@ def parse_old_features(config: MetaDict) -> MetaDict: del config["features"][key] for key, new_key in feature_key_remapping.items(): if key in config["features"]: - config["features"][new_key] = config["features"][key] del config["features"][key] else: @@ -448,7 +448,7 @@ def score_g2p(gold: List[str], hypo: List[str]) -> Tuple[int, int]: return 0, len(h) edits = 100000 best_length = 100000 - for (g, h) in itertools.product(gold, hypo): + for g, h in itertools.product(gold, hypo): e = edit_distance(g.split(), h.split()) if e < edits: edits = e @@ -613,6 +613,59 @@ def score_function(ref: str, pron: typing.List[str]): return transformed_pronunciations +def load_evaluation_mapping(custom_mapping_path): + with mfa_open(custom_mapping_path, "r") as f: + mapping = yaml.load(f, Loader=yaml.Loader) + for k, v in mapping.items(): + if isinstance(v, str): + mapping[k] = {v} + else: + mapping[k] = set(v) + return mapping + + +def fix_many_to_one_alignments(alignments, custom_mapping): + test_keys = set(x for x in custom_mapping.keys() if " " in x) + ref_keys = set() + for val in custom_mapping.values(): + ref_keys.update(x for x in val if " " in x) + new_ref = [] + new_test = [] + for a in alignments: + for i, sa in enumerate(a.seqA): + sb = a.seqB[i] + if i != 0: + prev_sa = a.seqA[i - 1] + prev_sb = a.seqB[i - 1] + ref_key = " ".join(x.label for x in [prev_sa, sa] if x != "-") + test_key = " ".join(x.label for 
x in [prev_sb, sb] if x != "-") + if ( + ref_key in ref_keys + and test_key in custom_mapping + and ref_key in custom_mapping[test_key] + ): + new_ref[-1].label = ref_key + new_ref[-1].end = sa.end + if sb != "-": + new_test.append(sb) + continue + if ( + test_key in test_keys + and test_key in custom_mapping + and ref_key in custom_mapping[test_key] + ): + new_test[-1].label = test_key + new_test[-1].end = sb.end + if sa != "-": + new_ref.append(sa) + continue + if sa != "-": + new_ref.append(sa) + if sb != "-": + new_test.append(sb) + return new_ref, new_test + + def align_phones( ref: List[CtmInterval], test: List[CtmInterval], @@ -633,6 +686,8 @@ def align_phones( List of CTM intervals to compare to reference silence_phone: str Silence phone (these are ignored in the final calculation) + ignored_phones: set[str], optional + Phones that should be ignored in score calculations (silence phone is automatically added) custom_mapping: dict[str, str], optional Mapping of phones to treat as matches even if they have different symbols debug: bool, optional @@ -650,6 +705,8 @@ def align_phones( if ignored_phones is None: ignored_phones = set() + if not isinstance(ignored_phones, set): + ignored_phones = set(ignored_phones) if custom_mapping is None: score_func = functools.partial(overlap_scoring, silence_phone=silence_phone) else: @@ -660,12 +717,18 @@ def align_phones( alignments = pairwise2.align.globalcs( ref, test, score_func, -2, -2, gap_char=["-"], one_alignment_only=True ) + if custom_mapping is not None: + ref, test = fix_many_to_one_alignments(alignments, custom_mapping) + alignments = pairwise2.align.globalcs( + ref, test, score_func, -2, -2, gap_char=["-"], one_alignment_only=True + ) overlap_count = 0 overlap_sum = 0 num_insertions = 0 num_deletions = 0 num_substitutions = 0 errors = collections.Counter() + ignored_phones.add(silence_phone) for a in alignments: for i, sa in enumerate(a.seqA): sb = a.seqB[i] @@ -689,16 +752,18 @@ def align_phones( if 
compare_labels(sa.label, sb.label, silence_phone, mapping=custom_mapping) > 0: num_substitutions += 1 errors[(sa.label, sb.label)] += 1 - if debug: - import logging - - logger = logging.getLogger("mfa") - logger.debug(pairwise2.format_alignment(*alignments[0])) if overlap_count: score = overlap_sum / overlap_count else: score = None phone_error_rate = (num_insertions + num_deletions + (2 * num_substitutions)) / len(ref) + if debug: + import logging + + logger = logging.getLogger("mfa") + logger.debug( + f"{pairwise2.format_alignment(*alignments[0])}\nScore: {score}\nPER: {phone_error_rate}\nErrors: {errors}" + ) return score, phone_error_rate, errors diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index 087a7bf9..b8c5cba3 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -532,38 +532,38 @@ def lda_mat(self) -> FloatMatrix: def mfcc_options(self) -> MetaDict: """Parameters to use in computing MFCC features.""" return { - "sample_frequency": self._meta["features"].get("sample_frequency", 16000), - "frame_shift": self._meta["features"].get("frame_shift", 10), - "frame_length": self._meta["features"].get("frame_length", 25), - "dither": self._meta["features"].get("dither", 0.0001), - "preemphasis_coefficient": self._meta["features"].get("preemphasis_coefficient", 0.97), - "snip_edges": self._meta["features"].get("snip_edges", True), - "num_mel_bins": self._meta["features"].get("num_mel_bins", 23), - "low_frequency": self._meta["features"].get("low_frequency", 20), - "high_frequency": self._meta["features"].get("high_frequency", 7800), - "num_coefficients": self._meta["features"].get("num_coefficients", 13), - "use_energy": self._meta["features"].get("use_energy", False), - "energy_floor": self._meta["features"].get("energy_floor", 1.0), - "raw_energy": self._meta["features"].get("raw_energy", True), - "cepstral_lifter": self._meta["features"].get("cepstral_lifter", 22), + "sample_frequency": 
self.meta["features"].get("sample_frequency", 16000), + "frame_shift": self.meta["features"].get("frame_shift", 10), + "frame_length": self.meta["features"].get("frame_length", 25), + "dither": self.meta["features"].get("dither", 0.0001), + "preemphasis_coefficient": self.meta["features"].get("preemphasis_coefficient", 0.97), + "snip_edges": self.meta["features"].get("snip_edges", True), + "num_mel_bins": self.meta["features"].get("num_mel_bins", 23), + "low_frequency": self.meta["features"].get("low_frequency", 20), + "high_frequency": self.meta["features"].get("high_frequency", 7800), + "num_coefficients": self.meta["features"].get("num_coefficients", 13), + "use_energy": self.meta["features"].get("use_energy", False), + "energy_floor": self.meta["features"].get("energy_floor", 1.0), + "raw_energy": self.meta["features"].get("raw_energy", True), + "cepstral_lifter": self.meta["features"].get("cepstral_lifter", 22), } @property def pitch_options(self) -> MetaDict: """Parameters to use in computing MFCC features.""" - use_pitch = self._meta["features"].get("use_pitch", False) - use_voicing = self._meta["features"].get("use_voicing", False) - use_delta_pitch = self._meta["features"].get("use_delta_pitch", False) - normalize = self._meta["features"].get("normalize_pitch", True) + use_pitch = self.meta["features"].get("use_pitch", False) + use_voicing = self.meta["features"].get("use_voicing", False) + use_delta_pitch = self.meta["features"].get("use_delta_pitch", False) + normalize = self.meta["features"].get("normalize_pitch", True) options = { - "frame_shift": self._meta["features"].get("frame_shift", 10), - "frame_length": self._meta["features"].get("frame_length", 25), - "min_f0": self._meta["features"].get("min_f0", 50), - "max_f0": self._meta["features"].get("max_f0", 800), - "sample_frequency": self._meta["features"].get("sample_frequency", 16000), - "penalty_factor": self._meta["features"].get("penalty_factor", 0.1), - "delta_pitch": 
self._meta["features"].get("delta_pitch", 0.005), - "snip_edges": self._meta["features"].get("snip_edges", True), + "frame_shift": self.meta["features"].get("frame_shift", 10), + "frame_length": self.meta["features"].get("frame_length", 25), + "min_f0": self.meta["features"].get("min_f0", 50), + "max_f0": self.meta["features"].get("max_f0", 800), + "sample_frequency": self.meta["features"].get("sample_frequency", 16000), + "penalty_factor": self.meta["features"].get("penalty_factor", 0.1), + "delta_pitch": self.meta["features"].get("delta_pitch", 0.005), + "snip_edges": self.meta["features"].get("snip_edges", True), "add_normalized_log_pitch": False, "add_delta_pitch": False, "add_pov_feature": False, @@ -579,8 +579,8 @@ def pitch_options(self) -> MetaDict: def lda_options(self) -> MetaDict: """Parameters to use in computing MFCC features.""" return { - "splice_left_context": self._meta["features"].get("splice_left_context", 3), - "splice_right_context": self._meta["features"].get("splice_right_context", 3), + "splice_left_context": self.meta["features"].get("splice_left_context", 3), + "splice_right_context": self.meta["features"].get("splice_right_context", 3), } @property @@ -647,7 +647,7 @@ def meta(self) -> MetaDict: self._meta["other_noise_phone"] = "sp" if "phone_set_type" not in self._meta: self._meta["phone_set_type"] = "UNKNOWN" - if "language" not in self._meta or self._meta["version"] <= "3.0": + if "language" not in self._meta or self._meta["version"] < "3.0": self._meta["language"] = "unknown" self._meta["phones"] = set(self._meta.get("phones", [])) if ( diff --git a/montreal_forced_aligner/tokenization/simple.py b/montreal_forced_aligner/tokenization/simple.py index 74abd9a6..95a880d1 100644 --- a/montreal_forced_aligner/tokenization/simple.py +++ b/montreal_forced_aligner/tokenization/simple.py @@ -133,6 +133,7 @@ def __init__( non_speech_regexes: typing.Dict[str, re.Pattern], oov_word: typing.Optional[str] = None, grapheme_set: 
typing.Optional[typing.Collection[str]] = None, + always_split_compounds: bool = False, ): self.word_table = word_table self.clitic_marker = clitic_marker @@ -154,6 +155,7 @@ def __init__( self.has_initial = True if self.final_clitic_regex is not None: self.has_final = True + self.always_split_compounds = always_split_compounds def to_str(self, normalized_text: str) -> str: """ @@ -198,16 +200,17 @@ def split_clitics( List of subwords """ split = [] + benefit = False if self.compound_regex is not None: s = [x for x in self.compound_regex.split(item) if x] - + if self.always_split_compounds and len(s) > 1: + benefit = True else: s = [item] if self.word_table is None: return [item] clean_initial_quote_regex = re.compile("^'") clean_final_quote_regex = re.compile("'$") - benefit = False for seg in s: if not seg: continue @@ -386,6 +389,7 @@ def __init__( self.non_speech_regexes, self.oov_word, self.grapheme_set, + always_split_compounds=self.use_g2p, ) def _compile_regexes(self) -> None: diff --git a/montreal_forced_aligner/vad/multiprocessing.py b/montreal_forced_aligner/vad/multiprocessing.py index c1a3db9d..1c2d3a5c 100644 --- a/montreal_forced_aligner/vad/multiprocessing.py +++ b/montreal_forced_aligner/vad/multiprocessing.py @@ -187,7 +187,8 @@ def segment_utterance_transcript( ) else: segments = segment_utterance_vad_speech_brain(utterance, vad_model, segmentation_options) - + if not segments: + return [utterance] config = LatticeFasterDecoderConfig() config.beam = beam config.lattice_beam = lattice_beam diff --git a/tests/test_helper.py b/tests/test_helper.py index 8292fb3d..2355dd07 100644 --- a/tests/test_helper.py +++ b/tests/test_helper.py @@ -1,12 +1,9 @@ -import yaml - from montreal_forced_aligner.data import CtmInterval -from montreal_forced_aligner.helper import align_phones, mfa_open +from montreal_forced_aligner.helper import align_phones, load_evaluation_mapping def test_align_phones(basic_corpus_dir, basic_dict_path, temp_dir, eval_mapping_path): - 
with mfa_open(eval_mapping_path) as f: - mapping = yaml.safe_load(f) + mapping = load_evaluation_mapping(eval_mapping_path) reference_phoneset = set() for v in mapping.values(): if isinstance(v, str):