Merge pull request #43 from monarch-initiative/fix-not-translated

Fix NOT_TRANSLATED treatment in prepare table and add default sorting to merge()
monarch-initiative · Mar 1, 2024 · a9ae364
2 parents 739d572 + b787f09
commit a9ae364
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 11 deletions.
diff --git a/src/babelon/cli.py b/src/babelon/cli.py
@@ -33,6 +33,12 @@
     type=click.File(mode="w"),
     default=sys.stdout,
 )
+sort_table_option = click.option(
+    "--sort-tables",
+    type=bool,
+    default=True,
+    help="If true, all output tables are sorted before written.",
+)
 output_format_option = click.option(
     "--output-format",
     "-t",
@@ -153,12 +159,7 @@ def translate(input, model, language_code, update_existing, output):
     default=True,
     help="If true, the translation status is changed to CANDIDATE if a source value has changed.",
 )
-@click.option(
-    "--sort-tables",
-    type=bool,
-    default=True,
-    help="If true, all output tables are sorted before written.",
-)
+@sort_table_option
 @output_option
 def prepare_translation(
     input,
@@ -228,8 +229,9 @@ def statistics_translation_profile_command(
 
 @click.command("merge")
 @multiple_inputs_argument
+@sort_table_option
 @output_option
-def merge(inputs, output):
+def merge(inputs, sort_tables, output):
     """Merge multiple babelon files into one."""
     df = pd.read_csv(inputs[0], sep="\t")
 
@@ -238,6 +240,9 @@ def merge(inputs, output):
         df_temp = pd.read_csv(input_file, sep="\t")
         df = pd.concat([df, df_temp], axis=0, ignore_index=True)
 
+    if sort_tables:
+        df = sort_babelon_tsv(df)
+
     if output:
         df.to_csv(output, sep="\t", index=False)
     else:

diff --git a/src/babelon/translate.py b/src/babelon/translate.py
@@ -205,17 +205,23 @@ def prepare_translation_for_ontology(
         source_value = row["source_value"]
         translation_status = row["translation_status"]
         term_metadata = _get_metadata_for_term(ontology, subject_id)
+        if translation_status == "NOT_TRANSLATED":
+            if not include_not_translated:
+                mark_index_for_removal.append(index)
+            if predicate_id in term_metadata:
+                output_not_translated_data.append(row.to_dict())
+            else:
+                logging.warning(
+                    f"{predicate_id} value for {subject_id} is marked as NOT_TRANSLATED,"
+                    f"but does not exist at all in the ontology. Omitting row."
+                )
         if predicate_id in term_metadata:
             ontology_value = term_metadata[predicate_id][0]
             if len(term_metadata[predicate_id]) > 1:
                 logging.warning(
                     f"{predicate_id} value for {subject_id} is ambiguous,"
                     f"picking first one ({term_metadata[predicate_id]})."
                 )
-            if translation_status == "NOT_TRANSLATED":
-                output_not_translated_data.append(row.to_dict())
-                if not include_not_translated:
-                    mark_index_for_removal.append(index)
             if ontology_value != source_value:
                 translation_value = row["translation_value"]
                 df_augmented.at[index, "source_value"] = ontology_value