Merge pull request #375 from vanheeringen-lab/develop

Develop -> master
vanheeringen-lab · Jun 9, 2020 · d4b79ab · d4b79ab
2 parents 6244ad7 + cbaa457
commit d4b79ab
Show file tree

Hide file tree

Showing 48 changed files with 544 additions and 501 deletions.
diff --git a/.github/workflows/continuousdeployment.yml b/.github/workflows/continuousdeployment.yml
@@ -0,0 +1,33 @@
+name: continuous-deployment
+
+# on push events tagged with a version number
+on:
+  push:
+    tags:
+      - 'v*'  # any tag starting with "v", e.g. v0.1.0
+
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - uses: actions/setup-python@v1
+      with:
+        python-version: 3.6
+
+    - name: Create Release
+      uses: actions/create-release@latest
+      env:
+        GITHUB_TOKEN: ${{ secrets.AccessToken }}
+      with:
+        tag_name: ${{ github.ref }}
+        release_name: Release ${{ github.ref }}
+        body: |
+          Changes in this Release:
+          - First Change
+          - Second Change
+          Probably link the CHANGELOG
+        draft: false
+        prerelease: false
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -28,6 +28,7 @@ jobs:
           python docs/scripts/schemas.py
           python docs/scripts/rule_description.py
           ./docs/scripts/gen_dags
+          cp CHANGELOG.md docs/content
           sphinx-build docs/ build
           touch build/.nojekyll
 

diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,8 @@
 *.Rhistory
 slurm*.out
 report.html
+build/
+*.egg*
 .idea/
 .snakemake/
 tinydata/

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,8 @@
+# Changelog
+All notable changes to `seq2science` will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+All changes fall under one of these types: `added`, `changed`, `deprecated`, `removed`, `fixed`, `security`.
+
+## [Unreleased]
diff --git a/bin/seq2science b/bin/seq2science
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-This is the user's entry-point for the seq2science pipeline.
+This is the user's entry-point for the seq2science tool.
 """
 import sys
 import argparse
@@ -23,6 +23,7 @@ except ImportError:
 
 __version__ = "0.0.0"
 
+
 def main():
     # set helpful paths
     base_dir = os.path.dirname(inspect.getfile(seq2science))
@@ -36,7 +37,7 @@ def main():
         dir_path = args.dir if os.path.isabs(args.dir) else os.path.join(os.getcwd(), args.dir)
         _init(args, workflows_dir, dir_path)
     elif args.command == "run":
-        config_path = args.config if os.path.isabs(args.config) else os.path.join(os.getcwd(), args.config)
+        config_path = args.configfile if os.path.isabs(args.configfile) else os.path.join(os.getcwd(), args.configfile)
         _run(args, base_dir, workflows_dir, config_path)
     elif args.command == "clean":
         _clean(base_dir)
@@ -97,7 +98,7 @@ def seq2science_parser(workflows_dir="./seq2science/workflows/"):
 
     run.add_argument(
         "-c",
-        "--config",
+        "--configfile",
         default="./config.yaml",
         metavar="FILE",
         help="The path to the config file.",
@@ -178,25 +179,25 @@ def _run(args, base_dir, workflows_dir, config_path):
 
     # parse the args
     parsed_args = {"snakefile": os.path.join(workflows_dir, args.workflow, "Snakefile"),
-                   "config": {"rule_dir": os.path.join(base_dir, "rules")},
                    "cores": args.cores,
                    "use_conda": True,
+                   "conda_frontend": "mamba",
                    "conda_prefix": os.path.join(base_dir, ".snakemake"),
                    "dryrun": args.dryrun}
 
     # get the additional snakemake options
     snakemake_options = args.snakemakeOptions if args.snakemakeOptions is not None else dict()
-
+    snakemake_options.setdefault("config", {}).update({"rule_dir": os.path.join(base_dir, "rules")})
     # parse the profile
     snakemake_options["configfiles"] = [config_path]
     if args.profile is not None:
-        config_file = snakemake.get_profile_file(args.profile, "config.yaml")
-        if config_file is None:
+        profile_file = snakemake.get_profile_file(args.profile, "config.yaml")
+        if profile_file is None:
             print("Error: profile given but no config.yaml found.")
             sys.exit(1)
-        parsed_args["configfiles"] += [config_file]
-        profile = yaml.safe_load(open(config_file).read())
-        if "cores" in profile:
+        snakemake_options["configfiles"] += [profile_file]
+        profile = yaml.safe_load(open(profile_file).read())
+        if "cores" in profile and parsed_args["cores"] is None:
             parsed_args["cores"] = profile["cores"]
 
     parsed_args.update(snakemake_options)
@@ -208,7 +209,7 @@ def _run(args, base_dir, workflows_dir, config_path):
 
     # run snakemake
     exit_code = snakemake.snakemake(**parsed_args)
-    sys.exit(exit_code)
+    sys.exit(0) if exit_code else sys.exit(1)
 
 
 def _clean(base_dir):
@@ -247,18 +248,26 @@ class _StoreDictKeyPair(argparse.Action):
         )
 
     def __call__(self, parser, namespace, values, option_string=None):
+        # TODO: cleanup
         my_dict = {}
         for kv in values:
             k, v = kv.split("=")
-            if v[0] == "{" and v[-1] == "}":
-                pairs = list(filter(None, re.split('{|:| |}', v)))
-                assert len(pairs) % 2 == 0
-                v = {pairs[i]: pairs[i+1] for i in range(0, len(pairs), 2)}
-                v = {k: int(v) if v.isdigit() else v for k, v in v.items()}
+
+            if ":" in v:
+                pair = list(filter(None, re.split('{|:| |}', v)))
+                assert len(pair) == 2
+                if pair[1].lower() == 'true':
+                    pair[1] = True
+                v = {pair[0]: int(pair[1]) if isinstance(pair[1], str) and pair[1].isdigit() else pair[1]}
+            elif "[" in v:
+                v = re.sub("\[|\]", "", v).split(",")
             try:
                 my_dict[k] = int(v)
             except:
-                my_dict[k] = v
+                if k not in my_dict:
+                    my_dict[k] = v
+                else:
+                    my_dict[k].update(v)
 
         setattr(namespace, self.dest, my_dict)
 

diff --git a/docs/content/extensive_docs.rst b/docs/content/extensive_docs.rst
@@ -11,3 +11,4 @@ The extensive docs serve to have all steps and configurable options documented.
    cli.rst
    all_rules.md
    schemas.md
+   CHANGELOG.md
diff --git a/requirements.yaml b/requirements.yaml
@@ -6,7 +6,6 @@ channels:
 dependencies:
   - pkgs/main::python=3.6
   - bioconda::snakemake>=5.18
-  - pip=20.0.2
   - bioconda::sra-tools=2.9.1
   - bioconda::entrez-direct=11.0
   - bioconda::pysam=0.15.3
@@ -16,5 +15,7 @@ dependencies:
   - bioconda::norns=0.1.5
   - anaconda::biopython=1.74
   - pkgs/main::filelock=3.0.12
-  - pip:
-    - git+https://github.com/daler/trackhub@params-overhaul
+  - pkgs/main::pyyaml
+  - pkgs/main::beautifulsoup4=4.9.0
+  - conda-forge::pretty_html_table=0.9.dev0
+  - bioconda::trackhub=0.1.2019.12.24
diff --git a/seq2science/envs/macs2.yaml b/seq2science/envs/macs2.yaml
@@ -4,5 +4,6 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - bioconda::macs2=2.2.4
+  - bioconda::macs2=2.2.7
   - bioconda::khmer=2.0
+  - pkgs/main::setuptools=47.1.1  # missing dependency of macs2's unique-kmers.py
diff --git a/seq2science/envs/qc.yaml b/seq2science/envs/qc.yaml
@@ -5,4 +5,4 @@ channels:
   - defaults
 dependencies:
   - bioconda::fastqc=0.11.8
-  - bioconda::multiqc=1.8
+  - bioconda::multiqc=1.9
diff --git a/seq2science/rules/DGE_analysis.smk b/seq2science/rules/DGE_analysis.smk
@@ -67,7 +67,7 @@ rule deseq2:
     resources:
         R_scripts=1 # conda's R can have issues when starting multiple times
     script:
-        "../scripts/deseq2.R"
+        f"{config['rule_dir']}/../scripts/deseq2.R"
 
 
 rule blind_clustering:
@@ -91,4 +91,4 @@ rule blind_clustering:
     resources:
         R_scripts=1 # conda's R can have issues when starting multiple times
     script:
-        "../scripts/deseq2_clustering.R"
+        f"{config['rule_dir']}/../scripts/deseq2_clustering.R"
diff --git a/seq2science/rules/alignment.smk b/seq2science/rules/alignment.smk
@@ -182,7 +182,7 @@ elif config['aligner'] == 'star' or config.get('quantifier', '') == 'star':
         input:
             genome = expand("{genome_dir}/{{assembly}}/{{assembly}}.fa", **config),
             sizefile= expand("{genome_dir}/{{assembly}}/{{assembly}}.fa.sizes", **config),
-            gtf = expand("{genome_dir}/{{assembly}}/{{assembly}}.gtf", **config)
+            gtf = expand("{genome_dir}/{{assembly}}/{{assembly}}.annotation.gtf", **config)
         output:
             directory(expand("{genome_dir}/{{assembly}}/index/{aligner}", **config))
         log:

diff --git a/seq2science/rules/bam_cleaning.smk b/seq2science/rules/bam_cleaning.smk
@@ -1,6 +1,3 @@
-import gzip
-
-
 def get_blacklist_files(wildcards):
     files = {}
     # ideally get genome is a checkpoint, however there are quite some Snakemake
@@ -30,9 +27,9 @@ rule setup_blacklist:
         newblacklist = ""
         if config.get('remove_blacklist') and wildcards.assembly.lower() in \
                 ["ce10", "dm3", "hg38", "hg19", "mm9", "mm10"]:
-            blacklist = f"{config['genome_dir']}/{wildcards.assembly}/{wildcards.assembly}.blacklist.bed.gz"
-            with gzip.GzipFile(blacklist) as file:
-                newblacklist += file.read().decode('utf8')
+            blacklist = f"{config['genome_dir']}/{wildcards.assembly}/{wildcards.assembly}.blacklist.bed"
+            with open(blacklist) as file:
+                newblacklist += file.read()
 
         if any('.fa.sizes' in inputfile for inputfile in input):
             with open(input.sizes, 'r') as file:
@@ -62,7 +59,7 @@ rule complement_blacklist:
     shell:
         """
         sortBed -faidx {input.sizes} -i {input.blacklist} |
-        complementBed -i /dev/stdin -g {input.sizes} > {output} 2> {log}
+        complementBed -i stdin -g {input.sizes} > {output} 2> {log}
         """
 
 
@@ -88,7 +85,7 @@ rule sieve_bam:
         expand("{benchmark_dir}/sieve_bam/{{assembly}}-{{sample}}.benchmark.txt", **config)[0]
     params:
         minqual=f"-q {config['min_mapping_quality']}",
-        atacshift=lambda wildcards, input: f"| ../../scripts/atacshift.pl /dev/stdin {input.sizes}" if config['tn5_shift'] else "",
+        atacshift=lambda wildcards, input: f"| {config['rule_dir']}/../scripts/atacshift.pl /dev/stdin {input.sizes}" if config['tn5_shift'] else "",
         blacklist=lambda wildcards, input: f"-L {input.blacklist}",
         prim_align=f"-F 256" if config["only_primary_align"] else ""
     conda:

diff --git a/seq2science/rules/bigfiles.smk b/seq2science/rules/bigfiles.smk
@@ -93,11 +93,11 @@ def get_bigpeak_type(wildcards):
 
 def get_bigpeak_schema(wildcards):
     if get_ftype(wildcards.peak_caller) == "narrowPeak":
-        return "../../schemas/bignarrowPeak.as"
+        return f"{config['rule_dir']}/../schemas/bignarrowPeak.as"
     if get_ftype(wildcards.peak_caller) == "broadPeak":
         if len(treps_from_brep[(wildcards.sample, wildcards.assembly)]) == 1:
-            return "../../schemas/bigbroadPeak.as"
-        return "../../schemas/bigBed.as"
+            return f"{config['rule_dir']}/../schemas/bigbroadPeak.as"
+        return f"{config['rule_dir']}/../schemas/bigBed.as"
     raise NotImplementedError()