Share the --sort-tables option between commands and handle NOT_TRANSLATED
rows before the ontology lookup.

cli.py: hoists the --sort-tables click option into a reusable
sort_table_option, applies it to prepare_translation and merge, and makes
merge sort the concatenated table with sort_babelon_tsv when the flag is true.

translate.py: moves the NOT_TRANSLATED handling in front of the
"predicate_id in term_metadata" check. Such rows are now marked for removal
(when include_not_translated is false) regardless of whether the predicate is
present; they are appended to output_not_translated_data only when it is
present, and a warning is logged otherwise.

Review notes:
- The new warning message has a space after "NOT_TRANSLATED," so the two
  adjacent f-string parts no longer run together in the log output.
- Leading indentation and blank context lines were reconstructed from Python
  syntax and the hunk line counts; verify against the target tree (or apply
  with whitespace tolerance) before merging.
- No hunk adds an import of sort_babelon_tsv to cli.py; confirm the name is
  already in scope there.
- The warning says "Omitting row." but the row is only queued in
  mark_index_for_removal when include_not_translated is false; confirm that
  this is the intended behaviour.

diff --git a/src/babelon/cli.py b/src/babelon/cli.py
index 06c5166..c7f8d27 100644
--- a/src/babelon/cli.py
+++ b/src/babelon/cli.py
@@ -33,6 +33,12 @@
     type=click.File(mode="w"),
     default=sys.stdout,
 )
+sort_table_option = click.option(
+    "--sort-tables",
+    type=bool,
+    default=True,
+    help="If true, all output tables are sorted before written.",
+)
 output_format_option = click.option(
     "--output-format",
     "-t",
@@ -153,12 +159,7 @@ def translate(input, model, language_code, update_existing, output):
     default=True,
     help="If true, the translation status is changed to CANDIDATE if a source value has changed.",
 )
-@click.option(
-    "--sort-tables",
-    type=bool,
-    default=True,
-    help="If true, all output tables are sorted before written.",
-)
+@sort_table_option
 @output_option
 def prepare_translation(
     input,
@@ -228,8 +229,9 @@ def statistics_translation_profile_command(
 
 @click.command("merge")
 @multiple_inputs_argument
+@sort_table_option
 @output_option
-def merge(inputs, output):
+def merge(inputs, sort_tables, output):
     """Merge multiple babelon files into one."""
 
     df = pd.read_csv(inputs[0], sep="\t")
@@ -238,6 +240,9 @@ def merge(inputs, output):
         df_temp = pd.read_csv(input_file, sep="\t")
         df = pd.concat([df, df_temp], axis=0, ignore_index=True)
 
+    if sort_tables:
+        df = sort_babelon_tsv(df)
+
     if output:
         df.to_csv(output, sep="\t", index=False)
     else:
diff --git a/src/babelon/translate.py b/src/babelon/translate.py
index a01b0ad..ed92685 100644
--- a/src/babelon/translate.py
+++ b/src/babelon/translate.py
@@ -205,6 +205,16 @@ def prepare_translation_for_ontology(
         source_value = row["source_value"]
         translation_status = row["translation_status"]
         term_metadata = _get_metadata_for_term(ontology, subject_id)
+        if translation_status == "NOT_TRANSLATED":
+            if not include_not_translated:
+                mark_index_for_removal.append(index)
+            if predicate_id in term_metadata:
+                output_not_translated_data.append(row.to_dict())
+            else:
+                logging.warning(
+                    f"{predicate_id} value for {subject_id} is marked as NOT_TRANSLATED, "
+                    f"but does not exist at all in the ontology. Omitting row."
+                )
         if predicate_id in term_metadata:
             ontology_value = term_metadata[predicate_id][0]
             if len(term_metadata[predicate_id]) > 1:
@@ -212,10 +222,6 @@ def prepare_translation_for_ontology(
                     f"{predicate_id} value for {subject_id} is ambiguous,"
                     f"picking first one ({term_metadata[predicate_id]})."
                 )
-            if translation_status == "NOT_TRANSLATED":
-                output_not_translated_data.append(row.to_dict())
-                if not include_not_translated:
-                    mark_index_for_removal.append(index)
             if ontology_value != source_value:
                 translation_value = row["translation_value"]
                 df_augmented.at[index, "source_value"] = ontology_value