
Release Version 1.0.0-rc2 #82

Merged
184 commits merged on Nov 17, 2023

Commits
98d9289
Initial implementation of ARIBA
HarryHung Jun 21, 2023
c1fd7c2
Include ARIBA info
HarryHung Jun 21, 2023
0106a59
Update ARIBA reference and metadata
HarryHung Jun 21, 2023
ede4f84
Rename included ARIBA references
HarryHung Jun 28, 2023
1233fde
Allow custom ARIBA references and save database
HarryHung Jun 28, 2023
de24962
Implement validation of existing ARIBA database
HarryHung Jun 30, 2023
93a926d
Improve naming of BWA database module and script
HarryHung Jun 30, 2023
199e213
Add MD5 check to BWA database
HarryHung Jun 30, 2023
9279759
Output ARIBA database info
HarryHung Jun 30, 2023
9ebff00
Improve DATABASES_INFO maintainability
HarryHung Jun 30, 2023
f8b857a
Improve code maintainability
HarryHung Jun 30, 2023
d6ab8c6
Include ARIBA database generation
HarryHung Jun 30, 2023
4140943
Improve code maintainability
HarryHung Jun 30, 2023
3bbde9b
Update ARIBA database
HarryHung Jul 4, 2023
bb3c158
Update default ARIBA reference files
HarryHung Jul 5, 2023
0a898f8
Remove unused script
HarryHung Jul 6, 2023
3ec7089
Initial work on extracting AMR from ARIBA report
HarryHung Jul 6, 2023
67277d0
Save AMR info to FreeText_Drug column
HarryHung Jul 6, 2023
ad3cdc8
Revert ARIBA assembler back to default
HarryHung Jul 6, 2023
2f8c6dc
Ensure type matching of variable comparison
HarryHung Jul 7, 2023
7bf5362
Remove LNZ from ARIBA database
HarryHung Jul 13, 2023
619a2e9
Update SeroBA image
HarryHung Jul 13, 2023
20af1fd
Use both normal and debug reports of ARIBA
HarryHung Jul 13, 2023
9c747b3
Improve header of ARIBA metadata
HarryHung Jul 13, 2023
9f649c3
Further work on extracting info from ARIBA reports
HarryHung Jul 13, 2023
44b1331
Improve robustness of JSON capture
HarryHung Jul 14, 2023
a0938d1
Improve target names
HarryHung Jul 14, 2023
a4d2251
ARIBA-based AMR detection prototype
HarryHung Jul 14, 2023
7a38ec5
Update to reflect change from AMRsearch to ARIBA
HarryHung Jul 17, 2023
5905964
Add ARIBA-related options, and update credits
HarryHung Jul 17, 2023
69009e2
Update License of ARIBA and resistanceDatabase
HarryHung Jul 17, 2023
d0ea4fc
Update Output section based on ARIBA-based AMR
HarryHung Jul 18, 2023
08ba429
Add AMR inference based on other AMR
HarryHung Jul 18, 2023
82c6fa9
Include new AMR in results.csv
HarryHung Jul 18, 2023
c2d321c
Fixing ERY and CLI determinant output when empty
HarryHung Jul 18, 2023
7de8841
Add information on new AMR
HarryHung Jul 18, 2023
11b318b
Merge branch 'dev' into feature/ariba-amr
HarryHung Jul 19, 2023
0ebe754
Merge pull request #57 from HarryHung/feature/ariba-amr
HarryHung Jul 19, 2023
e8a2905
Ensure version of Python 3 is captured
HarryHung Jul 19, 2023
dc1064e
Remove unnecessary versioning of Het-SNP Counter
HarryHung Jul 19, 2023
47bb796
Change default Python image to include Pandas
HarryHung Jul 19, 2023
231b7b1
Improve Docker Image capturing
HarryHung Jul 19, 2023
5d07225
Save reports as .csv
HarryHung Jul 20, 2023
ea1b1ba
Combining reports to generate sample report
HarryHung Jul 20, 2023
01b22b8
Initial work on output revamp (WIP)
HarryHung Jul 20, 2023
d0c5123
Save Lineage report per sample as .csv
HarryHung Jul 21, 2023
bfcca31
Add quote to csv header
HarryHung Jul 21, 2023
897aa09
Save Serotype report as .csv
HarryHung Jul 21, 2023
58a549f
Save MLST report as .csv
HarryHung Jul 21, 2023
51aebdc
Save PBP AMR report as .csv
HarryHung Jul 21, 2023
2bb7ad1
Save other AMR report as .csv
HarryHung Jul 21, 2023
e94cc56
Improve clarity of Read QC module
HarryHung Jul 24, 2023
fe079b9
Correct output column names
HarryHung Jul 24, 2023
19312f9
Initial implementation of overall report revamp
HarryHung Jul 24, 2023
4c7fc28
Improve comments; remove obsolete code
HarryHung Jul 24, 2023
073ddd3
Generate results.csv based on ARIBA metadata
HarryHung Jul 24, 2023
fd905cf
Improve shell scripts style
HarryHung Jul 25, 2023
29caffd
Refactor to improve maintainability & readability
HarryHung Jul 27, 2023
60614c8
Improve shell scripts style
HarryHung Jul 27, 2023
00e3de3
Fix comment
HarryHung Jul 27, 2023
b813ad0
Refactor to improve maintainability & readability
HarryHung Jul 27, 2023
6002529
Improve code comments
HarryHung Jul 28, 2023
63ad10b
Improve shell scripts style
HarryHung Jul 28, 2023
2e6707a
Fix outputting incorrect variable for Het-SNP#
HarryHung Jul 28, 2023
8d35258
Avoid numbers output as float
HarryHung Jul 28, 2023
1327f76
Refactor to improve maintainability & readability
HarryHung Jul 28, 2023
8f15a7d
Improve shell scripts style
HarryHung Jul 31, 2023
9d6ead6
Improve chart
HarryHung Jul 31, 2023
d899ec9
Update Nextflow executable to 23.04.2
HarryHung Jul 31, 2023
a98510e
Improve wording of messages.
HarryHung Jul 31, 2023
fa3f58f
Improve shell scripts style
HarryHung Jul 31, 2023
6c54b3e
Improve names & comments of processes & scripts
HarryHung Jul 31, 2023
76afb57
Extract scripts to separate files
HarryHung Aug 1, 2023
2e625f5
Improve shell scripts style
HarryHung Aug 1, 2023
0c70390
Improve option names consistency
HarryHung Aug 1, 2023
d76351c
Improve help message
HarryHung Aug 1, 2023
7935001
Improve content and update to reflect changes
HarryHung Aug 1, 2023
457b996
Update version of ARIBA container
HarryHung Aug 2, 2023
ebc7ce5
Update credits section
HarryHung Aug 2, 2023
8398069
Merge pull request #58 from HarryHung/feature/output-revamp
HarryHung Aug 2, 2023
db629f2
Use full Pandas image for NF metrics collection
HarryHung Aug 2, 2023
bc1842c
Merge pull request #59 from HarryHung/bugfix/python-metrics
HarryHung Aug 2, 2023
09672c2
Include Virulence in the chart
HarryHung Aug 2, 2023
494bfb3
Include information on virulence detection
HarryHung Aug 2, 2023
f3e330a
Merge pull request #60 from HarryHung/feautre/update-readme
HarryHung Aug 2, 2023
2dc2f89
Improve the robustness of LSF profile
HarryHung Aug 3, 2023
6c075e5
Improve LSF config for database processes
HarryHung Aug 3, 2023
4eac3df
Improve lsf profile description
HarryHung Aug 4, 2023
3eab3b0
Merge pull request #61 from HarryHung/feature/improve-lsf-profile
HarryHung Aug 4, 2023
ad2aa01
Update mlst image with latest database
HarryHung Aug 4, 2023
32fb3eb
Merge pull request #62 from HarryHung/feature/mlst-update
HarryHung Aug 4, 2023
0d4a5c4
Prototype schema
HarryHung Aug 4, 2023
0372bc2
Add sangertower profile
HarryHung Aug 4, 2023
c276926
Fix parameter validation for multiple profiles
HarryHung Aug 4, 2023
dc78cc6
Add basic I/O options
HarryHung Aug 4, 2023
c79932f
Update schema
HarryHung Aug 4, 2023
e629017
Fix missing period
HarryHung Aug 4, 2023
d8b2c7d
Ensure usage of DSL2
HarryHung Aug 6, 2023
b77377b
Fix type mismatch error
HarryHung Aug 7, 2023
1562dbe
Yield more accurate number
HarryHung Aug 7, 2023
9766fe2
Remove default values from schema
HarryHung Aug 7, 2023
208d312
Merge branch 'feature/better-tower-support' of https://github.com/Har…
HarryHung Aug 7, 2023
8b86a4c
Correct schema title
HarryHung Aug 7, 2023
628aadf
Display reports on NF Tower UI
HarryHung Aug 7, 2023
999212c
Improve clarity of Singularity error message
HarryHung Aug 8, 2023
ec1516c
Merge pull request #63 from HarryHung/feature/better-tower-support
HarryHung Aug 11, 2023
db33914
Add info.txt to Tower Reports
HarryHung Aug 14, 2023
ae7029f
Consolidate local databases
HarryHung Aug 14, 2023
365cb8e
Fix saving of BWA database
HarryHung Aug 14, 2023
4e7f024
Fix version printing
HarryHung Aug 14, 2023
384b6e3
Update Kraken2 database directory name
HarryHung Aug 14, 2023
4d02888
Fix PopPUNK external clusters info output
HarryHung Aug 14, 2023
1369db2
Fix saving of PopPUNK External Clusters
HarryHung Aug 14, 2023
cd1a0d1
Update schema based on latest changes
HarryHung Aug 15, 2023
9ae5c39
Fix database info output when saved externally
HarryHung Aug 15, 2023
a864719
Use publishDir to save info.txt for NF Tower
HarryHung Aug 15, 2023
5d3c0c9
Reflect changes from databases consolidation
HarryHung Aug 15, 2023
c139b6e
Remove empty row in Experimental table
HarryHung Aug 15, 2023
9698f6a
Merge pull request #64 from HarryHung/feature/databases-consolidation
HarryHung Aug 15, 2023
c1b2b97
Update ARIBA reference sequences
HarryHung Aug 15, 2023
4d7c80c
Merge pull request #65 from HarryHung/feature/ariba-db-update
HarryHung Aug 17, 2023
f13a349
Add information about Nextflow Tower
HarryHung Aug 18, 2023
2ee8154
Merge pull request #66 from HarryHung/feature/readme-nf-tower
HarryHung Aug 18, 2023
0d5f62e
Fix incorrect target of ermBups and ermbTr
HarryHung Aug 22, 2023
6be485c
Simplify ARIBA reference files naming
HarryHung Aug 22, 2023
5c3250b
ARIBA AMR detection mechanism update
HarryHung Aug 22, 2023
7c42388
Remove unnecessary ariba run option
HarryHung Aug 23, 2023
b04cc1f
Merge pull request #67 from HarryHung/feature/ariba-amr-update
HarryHung Aug 23, 2023
c1a1ba3
Improve schema for NF Tower
HarryHung Aug 24, 2023
7863e36
Merge pull request #68 from HarryHung/feature/schema-update
HarryHung Aug 24, 2023
84b8f4c
Add missing process labels to GET_KRAKEN2_DB
HarryHung Aug 24, 2023
a8ff167
Merge pull request #69 from HarryHung/bugfix/scratchless_kraken2
HarryHung Aug 24, 2023
00804a5
Update default GPS PopPUNK database URL
HarryHung Sep 6, 2023
46e94bb
Update fastp version
HarryHung Sep 8, 2023
e6d8474
Merge pull request #70 from HarryHung/feature/fastp-update
HarryHung Sep 13, 2023
155efe8
Correct folP variant nucleotide range
HarryHung Sep 19, 2023
751c3ac
Merge pull request #71 from HarryHung/feature/ariba-folp-update
HarryHung Sep 20, 2023
034b6b0
Add mapped read depth check for gene detection
HarryHung Sep 26, 2023
cc1fa25
Ensure sample report generation is stable
HarryHung Sep 27, 2023
747208e
Fix variable name
HarryHung Sep 27, 2023
85d4140
Quote variable input
HarryHung Sep 27, 2023
6fc3061
Improve input handling
HarryHung Sep 28, 2023
1a49e82
Merge pull request #72 from HarryHung/fix/stable-sample-output
HarryHung Sep 28, 2023
5fda044
Merge pull request #73 from HarryHung/feature/ariba-depth-check
HarryHung Oct 2, 2023
515a812
Update custom images
HarryHung Oct 2, 2023
3bd004d
Update descriptions of custom images
HarryHung Oct 2, 2023
be3d9ee
Update default SeroBA remote database
HarryHung Oct 2, 2023
a33663d
Change SeroBA database to release-based
HarryHung Oct 4, 2023
b0af563
Remove Git container information
HarryHung Oct 4, 2023
66ac6b4
Update info messages
HarryHung Oct 4, 2023
cb878ff
Update description of seroba_db_remote
HarryHung Oct 4, 2023
7d55b51
Merge pull request #74 from HarryHung/feature/update-custom-images
HarryHung Oct 4, 2023
5e2972f
Add second most abundant species check
HarryHung Sep 20, 2023
8282b78
Add info on the second most abundant species check
HarryHung Sep 21, 2023
d7c9772
Switch from 2nd species to top non-Strep genus
HarryHung Oct 3, 2023
fb6c680
Update based on Taxonomy QC changes
HarryHung Oct 3, 2023
ae8ccea
Show max non-Strep genus percentage in reads QC
HarryHung Oct 4, 2023
eb65850
Merge pull request #75 from HarryHung/feature/extra-taxonomy-qc
HarryHung Oct 4, 2023
40a7527
Fix relational operators to match descriptions
HarryHung Oct 16, 2023
4663956
Reflect changes in relational operators
HarryHung Oct 16, 2023
13c9b7e
Use dev SeroBA and remove Singularity workaround
HarryHung Oct 16, 2023
02b95d7
Merge pull request #76 from HarryHung/feature/unify-relational-operators
HarryHung Oct 17, 2023
4660719
Update SeroBA descriptions
HarryHung Oct 17, 2023
f2d86c6
Update to the latest SeroBA official release
HarryHung Oct 17, 2023
3077e83
Merge pull request #77 from HarryHung/feature/seroba-update
HarryHung Oct 17, 2023
9343659
Update the included Nextflow executable to 23.10.0
HarryHung Oct 17, 2023
67f3807
Add Launch on Nextflow Tower shield
HarryHung Oct 17, 2023
cb57017
Merge pull request #78 from HarryHung/feature/nextflow-update
HarryHung Oct 17, 2023
7f5b7be
Correct error message
HarryHung Oct 17, 2023
85f88cf
Add extra folA sequences
HarryHung Oct 23, 2023
1ac7fa4
Rewrite for variants' gene mapped depth check & FQ
HarryHung Oct 25, 2023
b0a0896
Merge pull request #79 from HarryHung/feature/improve-variant-detection
HarryHung Oct 30, 2023
f863704
Bump version to 1.0.0-rc1
HarryHung Oct 31, 2023
1054ad6
Reflect changes in relational operators
HarryHung Oct 31, 2023
be7941c
Add warning about output overwrite
HarryHung Nov 2, 2023
a774fb8
Add warning about Docker Desktop for Linux
HarryHung Nov 7, 2023
d8243c8
Remove test input from the repository
HarryHung Nov 15, 2023
2a92d7c
Ignore test_input
HarryHung Nov 15, 2023
e06eecb
Script for downloading test input
HarryHung Nov 15, 2023
b8c3e48
Update content for optional test data
HarryHung Nov 16, 2023
ec4d973
Fix: ref_start and ref_end could be non-numeric
HarryHung Nov 17, 2023
25c764c
Merge pull request #80 from HarryHung/bugfix/parse-other-resistance-fix
HarryHung Nov 17, 2023
b274c51
Merge pull request #81 from HarryHung/feature/optional-test-input
HarryHung Nov 17, 2023
c11bb4b
Bump version to 1.0.0-rc2
HarryHung Nov 17, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ work
 databases
 input
 output
+test_input
 *.html

 # Singularity cache
208 changes: 117 additions & 91 deletions README.md

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions bin/assembly_qc.sh

This file was deleted.

8 changes: 8 additions & 0 deletions bin/call_snp.sh
@@ -0,0 +1,8 @@
# Call SNPs and save to .vcf
# Remove source sorted BAM file if $LITE is true

bcftools mpileup --threads "$(nproc)" -f "$REFERENCE" "$SORTED_BAM" | bcftools call --threads "$(nproc)" -mv -O v -o "$VCF"

if [ "$LITE" = true ]; then
rm "$(readlink -f "$SORTED_BAM")"
fi
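These bin/ scripts read their inputs from shell variables that the pipeline binds at runtime; a minimal standalone sketch for call_snp.sh, assuming the variables can be passed as environment variables and with hypothetical file names:

# Hypothetical smoke test; REFERENCE/SORTED_BAM/VCF/LITE are normally supplied by the pipeline
REFERENCE=ref.fasta SORTED_BAM=sample.sorted.bam VCF=sample.vcf LITE=false \
    bash bin/call_snp.sh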
33 changes: 33 additions & 0 deletions bin/check-create_ariba_db.sh
@@ -0,0 +1,33 @@
# Check if the ARIBA database was prepared from the specified reference sequences and metadata.
# If not: remove the database directory, prepare the ARIBA database from the reference sequences and metadata, and save the metadata to JSON

REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }')
METADATA_MD5=$(md5sum "$METADATA" | awk '{ print $1 }')

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \
[ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \
[ ! "$(grep '"metadata"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \
[ ! "$(grep '"metadata_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/00.info.txt" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/00.version_info.txt" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then

rm -rf "${DB_LOCAL}"

mkdir -p "${DB_LOCAL}"
ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}"

echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
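The grep/sed pipeline above extracts one JSON field per line; where jq is available (the Kraken2 and PopPUNK scripts below already rely on it), an equivalent sketch of the same checks would be:

# Assumption: jq is present in this container; keys match the JSON written above
SAVED_REF=$(jq -r '.reference' "${DB_LOCAL}/${JSON_FILE}")
SAVED_REF_MD5=$(jq -r '.reference_md5' "${DB_LOCAL}/${JSON_FILE}")
[ "$SAVED_REF" = "$REF_SEQUENCES" ] && [ "$SAVED_REF_MD5" = "$REF_SEQUENCES_MD5" ]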
24 changes: 24 additions & 0 deletions bin/check-create_ref_genome_bwa_db.sh
@@ -0,0 +1,24 @@
# Check if the BWA database was prepared from the specified reference.
# If not: remove the database directory, construct the FM-index of the reference genome for BWA, and save metadata to JSON

REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }')

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \
[ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.amb" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.ann" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.bwt" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then

rm -rf "${DB_LOCAL}"

bwa index -p "$PREFIX" "$REFERENCE"

mkdir -p "${DB_LOCAL}"
mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "${DB_LOCAL}"

echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
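The saved FM-index is what BWA consumes downstream; a hedged sketch of such an alignment, with hypothetical read file names:

# Hypothetical downstream use of the saved index
bwa mem -t "$(nproc)" "${DB_LOCAL}/${PREFIX}" sample_1.fastq.gz sample_2.fastq.gz > sample.sam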
34 changes: 34 additions & 0 deletions bin/check-create_seroba_db.sh
@@ -0,0 +1,34 @@
# Check if the database was downloaded from the specified link and prepared with the specified k-mer size
# If not: remove the database directory, re-download, re-create the KMC and ARIBA databases, and save metadata to JSON

ZIPPED_REPO='seroba.tar.gz'

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"url"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$DB_REMOTE" ] || \
[ ! "$(grep '"kmer"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$KMER" ] || \
[ ! -d "${DB_LOCAL}/ariba_db" ] || \
[ ! -d "${DB_LOCAL}/kmer_db" ] || \
[ ! -d "${DB_LOCAL}/streptococcus-pneumoniae-ctvdb"] || \
[ ! -f "${DB_LOCAL}/cd_cluster.tsv" ] || \
[ ! -f "${DB_LOCAL}/cdhit_cluster" ] || \
[ ! -f "${DB_LOCAL}/kmer_size.txt" ] || \
[ ! -f "${DB_LOCAL}/meta.tsv" ] || \
[ ! -f "${DB_LOCAL}/reference.fasta" ]; then

rm -rf "${DB_LOCAL}"

wget "${DB_REMOTE}" -O $ZIPPED_REPO

mkdir tmp
tar -xzf $ZIPPED_REPO --strip-components=1 -C tmp

mkdir -p "${DB_LOCAL}"
mv tmp/database/* "${DB_LOCAL}"

seroba createDBs "${DB_LOCAL}" "${KMER}"

rm -f $ZIPPED_REPO

echo -e "{\n \"url\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
29 changes: 29 additions & 0 deletions bin/check-download_kraken2_db.sh
@@ -0,0 +1,29 @@
# Check if all files exist and were obtained from the database at the specified link.
# If not: remove the database directory, download and unzip the archive into it, and save metadata to JSON

ZIPPED_DB='kraken2_db.tar.gz'

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${DB_LOCAL}/hash.k2d" ] || \
[ ! -f "${DB_LOCAL}/opts.k2d" ] || \
[ ! -f "${DB_LOCAL}/taxo.k2d" ]; then

rm -rf "${DB_LOCAL}"

wget "${DB_REMOTE}" -O $ZIPPED_DB

# Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure
mkdir tmp
tar -xzf $ZIPPED_DB -C tmp
mkdir -p "${DB_LOCAL}"
find tmp -type f -exec mv {} "$DB_LOCAL" \;

rm -f $ZIPPED_DB

jq -n \
--arg url "${DB_REMOTE}" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}"

fi
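For reference, the jq -n template renders a two-field JSON document; with a hypothetical URL, ${JSON_FILE} would contain the following (the two PopPUNK scripts below write the same structure):

# Example content of ${JSON_FILE} (URL and timestamp are hypothetical):
# {
#   "url": "https://example.com/kraken2_db.tar.gz",
#   "save_time": "2023-11-17 12:00:00 UTC"
# }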
32 changes: 32 additions & 0 deletions bin/check-download_poppunk_db.sh
@@ -0,0 +1,32 @@
# Return PopPUNK database name

# Check if all files exist and were obtained from the database at the specified link.
# If not: remove the database directory, download and unzip the archive into it, and save metadata to JSON

DB_NAME=$(basename "$DB_REMOTE" .tar.gz)
DB_PATH=${DB_LOCAL}/${DB_NAME}

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.h5" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.dists.npy" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.dists.pkl" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_fit.npz" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_fit.pkl" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_graph.gt" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.refs" ]; then

rm -rf "${DB_LOCAL}"

wget "$DB_REMOTE" -O poppunk_db.tar.gz
mkdir -p "${DB_LOCAL}"
tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL"
rm poppunk_db.tar.gz

jq -n \
--arg url "$DB_REMOTE" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}"

fi
22 changes: 22 additions & 0 deletions bin/check-download_poppunk_ext_clusters.sh
@@ -0,0 +1,22 @@
# Return PopPUNK External Clusters file name

# Check if the specified external clusters file exists and was obtained from the specified link.
# If not: remove the external clusters directory, re-download the file, and save metadata to JSON

EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE")

if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \
[ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then

rm -rf "${EXT_CLUSTERS_LOCAL}"

mkdir -p "${EXT_CLUSTERS_LOCAL}"
wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}"

jq -n \
--arg url "$EXT_CLUSTERS_REMOTE" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}"

fi
11 changes: 11 additions & 0 deletions bin/convert_sam_to_sorted_bam.sh
@@ -0,0 +1,11 @@
# Convert SAM to sorted BAM file
# Remove source SAM file if $LITE is true

samtools view -@ "$(nproc)" -b "$SAM" > "$BAM"

samtools sort -@ "$(nproc)" -o "$SORTED_BAM" "$BAM"
rm "$BAM"

if [ "$LITE" = true ]; then
rm "$(readlink -f "$SAM")"
fi
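Since samtools sort accepts input on stdin, the intermediate BAM could in principle be avoided; a hedged single-pipe variant, a sketch rather than the pipeline's method, assuming sort's memory buffers suffice:

# Possible single-pipe variant (assumption only; the two-step form above is what the pipeline uses)
samtools view -@ "$(nproc)" -b "$SAM" | samtools sort -@ "$(nproc)" -o "$SORTED_BAM" -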
10 changes: 5 additions & 5 deletions bin/get_docker_compose.sh → bin/create_docker_compose.sh
@@ -1,14 +1,14 @@
-# Generate a Docker compose file that includes all images used in nextflow.config
+# Generate a Docker compose file that includes all images used in $NEXTFLOW_CONFIG

 COUNT=0

-echo "services:" >> $COMPOSE
+echo "services:" >> "$COMPOSE"

-grep -E "container\s?=" $NEXTFLOW_CONFIG \
+grep -E "container\s?=" "$NEXTFLOW_CONFIG" \
 | sort -u \
 | sed -r "s/\s+container\s?=\s?'(.+)'/\1/" \
 | while read -r IMAGE ; do
 COUNT=$((COUNT+1))
-echo "  SERVICE${COUNT}:" >> $COMPOSE
-echo "    image: $IMAGE" >> $COMPOSE
+echo "  SERVICE${COUNT}:" >> "$COMPOSE"
+echo "    image: $IMAGE" >> "$COMPOSE"
 done
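To illustrate what the script emits, given hypothetical container directives in $NEXTFLOW_CONFIG:

# Given lines like these in $NEXTFLOW_CONFIG (hypothetical images):
#     container = 'quay.io/biocontainers/bwa:0.7.17'
#     container = 'staphb/samtools:1.16'
# the loop appends to $COMPOSE:
# services:
#   SERVICE1:
#     image: quay.io/biocontainers/bwa:0.7.17
#   SERVICE2:
#     image: staphb/samtools:1.16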
9 changes: 0 additions & 9 deletions bin/create_seroba_db.sh

This file was deleted.

95 changes: 95 additions & 0 deletions bin/generate_overall_report.py
@@ -0,0 +1,95 @@
#! /usr/bin/env python3

# Generate overall report based on sample reports and columns specified by COLUMNS_BY_CATEGORY and ARIBA metadata

import sys
from itertools import chain
import pandas as pd
import glob


# Specify the columns to be included in the output file and their order (except those derived from ARIBA metadata)
COLUMNS_BY_CATEGORY = {
    'IDENTIFICATION': ['Sample_ID'],
    'QC': ['Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC'],
    'READ': ['Bases'],
    'ASSEMBLY': ['Contigs#', 'Assembly_Length', 'Seq_Depth'],
    'MAPPING': ['Ref_Cov_%', 'Het-SNP#'],
    'TAXONOMY': ['S.Pneumo_%', 'Top_Non-Strep_Genus', 'Top_Non-Strep_Genus_%'],
    'LINEAGE': ['GPSC'],
    'SEROTYPE': ['Serotype'],
    'MLST': ['ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl'],
    'PBP': ['pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)']
}


# Check argv and save to global variables
if len(sys.argv) != 4:
    sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA OUTPUT_FILE')
INPUT_PATTERN = sys.argv[1]
ARIBA_METADATA = sys.argv[2]
OUTPUT_FILE = sys.argv[3]


def main():
    output_columns = get_output_columns()
    df_output = get_df_output(output_columns)

    # Save df_output to OUTPUT_FILE in CSV format
    df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_')


# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata
def get_output_columns():
    output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values()))
    add_ariba_columns(output_columns)
    return output_columns


# Based on ARIBA metadata, add additional output columns
def add_ariba_columns(output_columns):
    # Get all targets in ARIBA metadata
    ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())

    # Add special cases if certain targets exist
    if 'TET' in ariba_targets:
        ariba_targets.add('DOX')
    if 'FQ' in ariba_targets:
        ariba_targets.add('LFX')
    if 'TMP' in ariba_targets and 'SMX' in ariba_targets:
        ariba_targets.add('COT')
    if 'ERY_CLI' in ariba_targets:
        ariba_targets.update(['ERY', 'CLI'])

    # Add all targets alphabetically, except always adding PILI at the end
    pilis = []
    for target in sorted(ariba_targets):
        if target.lower().startswith('pili'):
            pilis.append(target)
        else:
            output_columns.extend([f'{target}_Res', f'{target}_Determinant'])
    for pili in pilis:
        output_columns.extend([f'{pili}', f'{pili}_Determinant'])


# Generate df_output based on all sample reports, with columns in the order of output_columns
def get_df_output(output_columns):
    # Generate an empty dataframe as df_manifest based on output_columns
    df_manifest = pd.DataFrame(columns=output_columns)

    # Generate a dataframe for each sample report, then concat df_manifest and all dataframes into df_output
    dfs = [df_manifest]
    reports = glob.glob(INPUT_PATTERN)
    for report in reports:
        df = pd.read_csv(report, dtype=str)
        dfs.append(df)
    df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID'])

    # Ensure column order in df_output is the same as output_columns
    df_output = df_output[output_columns]

    return df_output


if __name__ == "__main__":
    main()
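A hedged usage sketch (file names are assumptions; the pipeline supplies the real paths). The glob pattern is quoted so that Python's glob, not the shell, expands it:

# Hypothetical invocation
./generate_overall_report.py '*_sample_report.csv' ariba_metadata.tsv results.csv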
5 changes: 5 additions & 0 deletions bin/generate_sample_report.sh
@@ -0,0 +1,5 @@
# Combine all csv reports into a single csv, then add Sample_ID as the first field

paste -d , ${SAMPLE_ID}_process_report_*.csv \
| sed '1 s/^/\"Sample_ID\",/' \
| sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT"
14 changes: 14 additions & 0 deletions bin/get_assembly_qc.sh
@@ -0,0 +1,14 @@
# Extract assembly QC information and determine the QC result based on Quast's report.tsv and the total base count

CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT")
LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT")
DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l)

if [[ $CONTIGS -le $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then
ASSEMBLY_QC="PASS"
else
ASSEMBLY_QC="FAIL"
fi

echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > "$ASSEMBLY_QC_REPORT"
echo \""$ASSEMBLY_QC"\",\""$CONTIGS"\",\""$LENGTH"\",\""$DEPTH"\" >> "$ASSEMBLY_QC_REPORT"