From ad0f15cbdbdf80a1cd71e3e241f6c8c74ee00c0d Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Thu, 28 Nov 2024 12:31:57 +1300
Subject: [PATCH] WIP: prototype separate workflows as entrypoints

See <https://bedfordlab.slack.com/archives/C01LCTT7JNN/p1732568407123369>
for context.

Can be run in a number of different ways:

- From the 'avian-flu' repo:
    - `snakemake -s gisaid/Snakefile ...`
    - `cd gisaid && snakemake ...`
    - `snakemake --configfile snakemake/config.yaml`
- From a separate analysis directory, where ${AVIAN_FLU} is the path
  to the (locally checked out) avian-flu repo
    - without any config overlays: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile`
    - with a `config.yaml` overlay: (same as above)
    - with a `foo.yaml` overlay: `snakemake -s ${AVIAN_FLU}/gisaid/Snakefile --configfile foo.yaml`
---
 Snakefile                                     | 102 +++++++++---------
 gisaid/Snakefile                              |   4 +
 config/gisaid.yaml => gisaid/config.yaml      |   0
 h5n1-cattle-outbreak/Snakefile                |   4 +
 .../config.yaml                               |   1 +
 5 files changed, 58 insertions(+), 53 deletions(-)
 create mode 100644 gisaid/Snakefile
 rename config/gisaid.yaml => gisaid/config.yaml (100%)
 create mode 100644 h5n1-cattle-outbreak/Snakefile
 rename config/h5n1-cattle-outbreak.yaml => h5n1-cattle-outbreak/config.yaml (99%)

diff --git a/Snakefile b/Snakefile
index d7c987a..2c28e52 100755
--- a/Snakefile
+++ b/Snakefile
@@ -8,59 +8,34 @@ wildcard_constraints:
 SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
 #SUBTYPES = ["h5n1", "h5nx", "h7n9", "h9n2"]
 
-# ----------------------------------------------------------------------------
-# Allow this to work from a separate workdir by using a config in that workdir
-# which extends one of our base configs
-# ----------------------------------------------------------------------------
-if os.path.exists("config.yaml"):
-    configfile: "config.yaml"
-    # See commentary below
-    # print("This doesn't work as expected! See commentary in Snakefile", file=sys.stderr)
-    # exit(2)
-
-if config.get('extends', False):
-    extend_path = os.path.join(workflow.basedir, "config", config['extends'])
-    if not os.path.isfile(extend_path):
-        sys.exit(f"Your config tried to extend {config['extends']!r} but this doesn't exist. It must be relative to {os.path.join(workflow.basedir, 'config')}")
-    configfile: extend_path
-
-# NOTE:
-# In the situation where we're running outside of the repo, and we have a custom config YAML
-# such as `foo.yaml`:
-#        extends: h5n1-cattle-outbreak.yaml
-#        segments: ['pb2']
-# If we run with `--configfile foo.yaml` then the merging behaviour is strange (to me!)
-# We've clearly parsed the --configfile, as we have config['extends']="h5n1-cattle-outbreak.yaml",
-# and we do merge in all the config values of `h5n1-cattle-outbreak.yaml` (via the above code)
-# so I expected we'd therefore have config['segments']=['genome', 'pb2', 'pb1', ...]
-# as defined in 'h5n1-cattle-outbreak.yaml', however we end up with only config['segments']=['pb2'].
-# So it seems like the `--configfile` definitions are being re-applied a second time?!?
-#
-# This _is not the case_ when we use the `os.path.exists("config.yaml")` approach,
-# which is why it's not going to work without the following additional update_config
-# step (or something else?)
+CURRENT_BASEDIR = workflow.current_basedir # TODO XXX store this value here - can't access within functions because workflow.included_stack is empty
 
+# Load the base config.yaml relative to the entry snakefile (i.e. not this snakefile)
+if os.path.exists(os.path.join(workflow.basedir, 'config.yaml')):
+    print('LOADING BASE CONFIG')
+    configfile: os.path.join(workflow.basedir, 'config.yaml')
+
+# load a config.yaml file if it exists in the current working directory
 if os.path.exists("config.yaml"):
-    # Following <https://github.com/snakemake/snakemake/blob/76d53290a003891c5ee41f81e8eb4821c406255d/snakemake/common/configfile.py#L7-L33>
-    import yte
-    with open("config.yaml", encoding='utf-8') as f:
-        overwrite_config = yte.process_yaml(f, require_use_yte=True)
-    snakemake.utils.update_config(config, overwrite_config)
+    configfile: "config.yaml"
 
 from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove
 
+class InvalidConfigError(Exception):
+    pass
 
 def resolve_config_path(original_path, wildcards=None):
     """
-    Resolve a relative *path* given in a configuration value.
-    Resolves *path* as relative to the workflow's ``config/`` directory (i.e.
-    ``os.path.join(workflow.basedir, "config", path)``) if it doesn't exist
-    in the workflow's analysis directory (i.e. the current working
-    directory, or workdir, usually given by ``--directory`` (``-d``)).
-    This behaviour allows a default configuration value to point to a default
-    auxiliary file while also letting the file used be overridden either by
-    setting an alternate file path in the configuration or by creating a file
-    with the conventional name in the workflow's analysis directory.
+    Resolve a relative *path* given in a configuration value. Before resolving,
+    any '{x}' substrings are replaced by their corresponding wildcard values
+    (if the `wildcards` argument is provided).
+    
+    Search order (first match returned):
+    1. Relative to the analysis directory
+    2. Relative to the directory the entry snakefile was in. Typically this
+       is not the Snakefile you are looking at now but (e.g.) the one in
+       avian-flu/gisaid
+    3. Relative to where this Snakefile is (i.e. `avian-flu/`)
     """
     path = original_path.format(**wildcards) if wildcards else original_path
 
@@ -71,17 +46,30 @@ def resolve_config_path(original_path, wildcards=None):
             print(f"The call to `resolve_config_path({original_path!r})` includes unresolved wildcards - please include the wildcards as the second argument to `resolve_config_path`.", file=sys.stderr)
         exit(2)
 
-    if not os.path.exists(path):
-        # Check if the path exists relative to the basedir. This catches things like "config/…"
-        # as well as "clade-labeling/h5n1-clades.tsv"
-        basepath = os.path.join(workflow.basedir, path)
+    if os.path.exists(path): # isfile?
+        return path
+
+    # Check if the path exists relative to the subdir where the entry snakefile is
+    # (e.g. avian-flu/gisaid). If you want to use further subdirectories (e.g. avian-flu/gisaid/config/x.tsv)
+    # you're expected to supply the 'config/x.tsv' as the value in the config YAML
+    # NOTE: this means analysis directory overrides have to use that same 'config/x.tsv' structure, but
+    # given the different directories avian-flu uses that's acceptable. In other words, if we standardised
+    # avian-flu then we could add subdirectories to the search order here
+    basepath = os.path.join(workflow.basedir, path)
+    if os.path.exists(basepath):
+        return basepath
+
+    # Check if the path exists relative to where _this_ snakefile is, i.e. relative to `avian-flu/`.
+    if workflow.basedir != CURRENT_BASEDIR:
+        basepath = os.path.join(CURRENT_BASEDIR, path)
         if os.path.exists(basepath):
             return basepath
 
-        print(f"Unable to resolve the path {path!r} either within the working directory or within {workflow.basedir!r}", file=sys.stderr)
-        exit(2)
-
-    return path
+    raise InvalidConfigError(f"Unable to resolve the config-provided path {original_path!r}, expanded to {path!r} after filling in wildcards. "
+        f"The following directories were searched:\n"
+        f"\t1. {os.path.abspath(os.curdir)} (current working directory)\n"
+        f"\t2. {workflow.basedir} (where the entry snakefile is)\n"
+        f"\t3. {CURRENT_BASEDIR} (where the main avian-flu snakefile is)\n")
 
 
 # The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument)
@@ -95,6 +83,14 @@ S3_SRC = config.get('s3_src', {})
 LOCAL_INGEST = config.get('local_ingest', None)
 
 def sanity_check_config():
+    if not len(config.keys()):
+        print("-"*80 + "\nNo config loaded!", file=sys.stderr)
+        print("Avian-flu is intended to be run from the snakefile inside a subdir " 
+            "(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. " 
+            "Alternatively you can pass in the config via `--configfile`", file=sys.stderr)
+        print("-"*80, file=sys.stderr)
+        raise InvalidConfigError("No config")
+
     assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'"
     # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST
     # if we want to use `--config local_ingest=gisaid` overrides.
diff --git a/gisaid/Snakefile b/gisaid/Snakefile
new file mode 100644
index 0000000..d5db20f
--- /dev/null
+++ b/gisaid/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
\ No newline at end of file
diff --git a/config/gisaid.yaml b/gisaid/config.yaml
similarity index 100%
rename from config/gisaid.yaml
rename to gisaid/config.yaml
diff --git a/h5n1-cattle-outbreak/Snakefile b/h5n1-cattle-outbreak/Snakefile
new file mode 100644
index 0000000..d5db20f
--- /dev/null
+++ b/h5n1-cattle-outbreak/Snakefile
@@ -0,0 +1,4 @@
+include: "../Snakefile"
+
+rule _all:
+    input: rules.all.input
\ No newline at end of file
diff --git a/config/h5n1-cattle-outbreak.yaml b/h5n1-cattle-outbreak/config.yaml
similarity index 99%
rename from config/h5n1-cattle-outbreak.yaml
rename to h5n1-cattle-outbreak/config.yaml
index ba78421..363c09d 100644
--- a/config/h5n1-cattle-outbreak.yaml
+++ b/h5n1-cattle-outbreak/config.yaml
@@ -36,6 +36,7 @@ target_sequences_per_tree: 10_000
 
 
 #### Config files ####
+
 reference: config/h5n1/reference_h5n1_{segment}.gb  # use H5N1 references
 genome_reference: config/{subtype}/h5_cattle_genome_root.gb
 auspice_config: config/{subtype}/auspice_config_{subtype}.json