Merge pull request #82 from HarryHung/dev
Release Version 1.0.0-rc2

Former-commit-id: 3618ab5
HarryHung authored Nov 17, 2023
2 parents b722a26 + 2d38b19 commit 02885cc
Showing 77 changed files with 3,444 additions and 1,016 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ work
databases
input
output
test_input
*.html

# Singularity cache
208 changes: 117 additions & 91 deletions README.md

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions bin/assembly_qc.sh

This file was deleted.

8 changes: 8 additions & 0 deletions bin/call_snp.sh
@@ -0,0 +1,8 @@
# Call SNPs and save to .vcf
# Remove source sorted BAM file if $LITE is true

bcftools mpileup --threads "$(nproc)" -f "$REFERENCE" "$SORTED_BAM" | bcftools call --threads "$(nproc)" -mv -O v -o "$VCF"

if [ "$LITE" = true ]; then
rm "$(readlink -f "$SORTED_BAM")"
fi
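
A minimal usage sketch for the script above (and for the other bin/*.sh helpers added in this commit, which follow the same pattern). The variable values are placeholders, not part of this commit; in the pipeline they are presumably supplied by the calling Nextflow process:

export REFERENCE=reference.fasta        # placeholder path to the reference FASTA
export SORTED_BAM=sample_sorted.bam     # placeholder path to the sorted BAM
export VCF=sample.vcf                   # placeholder output VCF name
export LITE=false                       # keep the source sorted BAM
bash bin/call_snp.sh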
33 changes: 33 additions & 0 deletions bin/check-create_ariba_db.sh
@@ -0,0 +1,33 @@
# Check if the ARIBA database was prepared from the specified reference sequences and metadata.
# If not: remove the local database directory, rebuild the ARIBA database from the reference sequences and metadata, and save the metadata to JSON

REF_SEQUENCES_MD5=$(md5sum "$REF_SEQUENCES" | awk '{ print $1 }')
METADATA_MD5=$(md5sum "$METADATA" | awk '{ print $1 }')

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES" ] || \
[ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REF_SEQUENCES_MD5" ] || \
[ ! "$(grep '"metadata"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA" ] || \
[ ! "$(grep '"metadata_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$METADATA_MD5" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/00.info.txt" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/00.version_info.txt" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_genes.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_metadata.tsv" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/01.filter.check_noncoding.log" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.all.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.pickle" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.clusters.tsv" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.gene.varonly.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.fa" ] || \
[ ! -f "${DB_LOCAL}/${OUTPUT}/02.cdhit.noncoding.varonly.fa" ] ; then

rm -rf "${DB_LOCAL}"

mkdir -p "${DB_LOCAL}"
ariba prepareref -f "$REF_SEQUENCES" -m "$METADATA" "${DB_LOCAL}/${OUTPUT}"

echo -e "{\n \"reference\": \"$REF_SEQUENCES\",\n \"reference_md5\": \"$REF_SEQUENCES_MD5\",\n \"metadata\": \"$METADATA\",\n \"metadata_md5\": \"$METADATA_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
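
The grep/sed parsing of the JSON above could equally be done with jq, which other scripts added in this commit already use for the same kind of check. A sketch under that assumption (same JSON layout as written by the echo above):

# Sketch: read the recorded reference path and its MD5 with jq instead of grep/sed
RECORDED_REF=$(jq -r .reference "${DB_LOCAL}/${JSON_FILE}")
RECORDED_REF_MD5=$(jq -r .reference_md5 "${DB_LOCAL}/${JSON_FILE}")
[ "$RECORDED_REF" == "$REF_SEQUENCES" ] && [ "$RECORDED_REF_MD5" == "$REF_SEQUENCES_MD5" ]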
24 changes: 24 additions & 0 deletions bin/check-create_ref_genome_bwa_db.sh
@@ -0,0 +1,24 @@
# Check if the BWA database was prepared from the specified reference.
# If not: remove the local database directory, construct the FM-index database of the reference genome for BWA, and save the metadata to JSON

REFERENCE_MD5=$(md5sum "$REFERENCE" | awk '{ print $1 }')

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"reference"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE" ] || \
[ ! "$(grep '"reference_md5"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$REFERENCE_MD5" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.amb" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.ann" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.bwt" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.pac" ] || \
[ ! -f "${DB_LOCAL}/${PREFIX}.sa" ] ; then

rm -rf "${DB_LOCAL}"

bwa index -p "$PREFIX" "$REFERENCE"

mkdir -p "${DB_LOCAL}"
mv "${PREFIX}.amb" "${PREFIX}.ann" "${PREFIX}.bwt" "${PREFIX}.pac" "${PREFIX}.sa" -t "${DB_LOCAL}"

echo -e "{\n \"reference\": \"$REFERENCE\",\n \"reference_md5\": \"$REFERENCE_MD5\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
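
As a side note, bwa index -p accepts a path prefix, so an equivalent variant (a sketch only, not what this commit does; it assumes the installed bwa accepts a path in -p, as current releases do) could write the index files straight into the database directory and skip the mv:

# Sketch: build the FM-index directly inside $DB_LOCAL
mkdir -p "${DB_LOCAL}"
bwa index -p "${DB_LOCAL}/${PREFIX}" "$REFERENCE"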
34 changes: 34 additions & 0 deletions bin/check-create_seroba_db.sh
@@ -0,0 +1,34 @@
# Check if the SeroBA database was downloaded from the specified link and prepared with the specified k-mer size.
# If not: remove the local database directory, re-download it, re-create the KMC and ARIBA databases, and save the metadata to JSON

ZIPPED_REPO='seroba.tar.gz'

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$(grep '"url"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$DB_REMOTE" ] || \
[ ! "$(grep '"kmer"' "${DB_LOCAL}/${JSON_FILE}" | sed -r 's/.+: "(.*)",?/\1/')" == "$KMER" ] || \
[ ! -d "${DB_LOCAL}/ariba_db" ] || \
[ ! -d "${DB_LOCAL}/kmer_db" ] || \
[ ! -d "${DB_LOCAL}/streptococcus-pneumoniae-ctvdb"] || \
[ ! -f "${DB_LOCAL}/cd_cluster.tsv" ] || \
[ ! -f "${DB_LOCAL}/cdhit_cluster" ] || \
[ ! -f "${DB_LOCAL}/kmer_size.txt" ] || \
[ ! -f "${DB_LOCAL}/meta.tsv" ] || \
[ ! -f "${DB_LOCAL}/reference.fasta" ]; then

rm -rf "${DB_LOCAL}"

wget "${DB_REMOTE}" -O $ZIPPED_REPO

mkdir tmp
tar -xzf $ZIPPED_REPO --strip-components=1 -C tmp

mkdir -p "${DB_LOCAL}"
mv tmp/database/* "${DB_LOCAL}"

seroba createDBs "${DB_LOCAL}" "${KMER}"

rm -f $ZIPPED_REPO

echo -e "{\n \"url\": \"$DB_REMOTE\",\n \"kmer\": \"$KMER\",\n \"create_time\": \"$(date +"%Y-%m-%d %H:%M:%S %Z")\"\n}" > "${DB_LOCAL}/${JSON_FILE}"

fi
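
The extraction above uses a fixed tmp directory name; a variant using mktemp avoids clashing with any pre-existing tmp directory. This is only a sketch, with behaviour otherwise assumed unchanged:

# Sketch: extract into a throw-away directory from mktemp instead of a fixed "tmp"
TMP_DIR=$(mktemp -d)
tar -xzf "$ZIPPED_REPO" --strip-components=1 -C "$TMP_DIR"
mkdir -p "${DB_LOCAL}"
mv "$TMP_DIR"/database/* "${DB_LOCAL}"
rm -rf "$TMP_DIR"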
29 changes: 29 additions & 0 deletions bin/check-download_kraken2_db.sh
@@ -0,0 +1,29 @@
# Check if all files exist and were obtained from the database at the specified link.
# If not: remove files in the database directory, download and unzip the archive into it, and save the metadata to JSON

ZIPPED_DB='kraken2_db.tar.gz'

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${DB_LOCAL}/hash.k2d" ] || \
[ ! -f "${DB_LOCAL}/opts.k2d" ] || \
[ ! -f "${DB_LOCAL}/taxo.k2d" ]; then

rm -rf "${DB_LOCAL}"

wget "${DB_REMOTE}" -O $ZIPPED_DB

# Use tmp dir and find to ensure files are saved directly at $DB_LOCAL regardless of archive directory structure
mkdir tmp
tar -xzf $ZIPPED_DB -C tmp
mkdir -p "${DB_LOCAL}"
find tmp -type f -exec mv {} "$DB_LOCAL" \;

rm -f $ZIPPED_DB

jq -n \
--arg url "${DB_REMOTE}" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}"

fi
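
A self-contained demo of the flattening step above, with hypothetical file names (the archive layout is an assumption):

# Demo: files nested inside the archive end up directly in the target directory
mkdir -p demo_tmp/k2_db demo_db
touch demo_tmp/k2_db/hash.k2d demo_tmp/k2_db/opts.k2d demo_tmp/k2_db/taxo.k2d
find demo_tmp -type f -exec mv {} demo_db \;
ls demo_db          # hash.k2d  opts.k2d  taxo.k2d
rm -rf demo_tmp demo_db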
32 changes: 32 additions & 0 deletions bin/check-download_poppunk_db.sh
@@ -0,0 +1,32 @@
# Return PopPUNK database name

# Check if all files exist and were obtained from the database at the specified link.
# If not: remove all sub-directories, download and unzip the archive into the database directory, and save the metadata to JSON

DB_NAME=$(basename "$DB_REMOTE" .tar.gz)
DB_PATH=${DB_LOCAL}/${DB_NAME}

if [ ! -f "${DB_LOCAL}/${JSON_FILE}" ] || \
[ ! "$DB_REMOTE" == "$(jq -r .url "${DB_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.h5" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.dists.npy" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.dists.pkl" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_fit.npz" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_fit.pkl" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_graph.gt" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}_clusters.csv" ] || \
[ ! -f "${DB_PATH}/${DB_NAME}.refs" ]; then

rm -rf "${DB_LOCAL}"

wget "$DB_REMOTE" -O poppunk_db.tar.gz
mkdir -p "${DB_LOCAL}"
tar -xzf poppunk_db.tar.gz -C "$DB_LOCAL"
rm poppunk_db.tar.gz

jq -n \
--arg url "$DB_REMOTE" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${DB_LOCAL}/${JSON_FILE}"

fi
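
A hypothetical example of the name derivation above (the URL is a placeholder, not taken from this commit):

basename "https://example.org/poppunk_refdb_v1.tar.gz" .tar.gz    # -> poppunk_refdb_v1
# so DB_PATH becomes ${DB_LOCAL}/poppunk_refdb_v1 and files such as poppunk_refdb_v1.h5 are expected inside it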
22 changes: 22 additions & 0 deletions bin/check-download_poppunk_ext_clusters.sh
@@ -0,0 +1,22 @@
# Return PopPUNK External Clusters file name

# Check if the specified external clusters file exists and was obtained from the specified link.
# If not: remove the local directory, re-download the file into it, and save the metadata to JSON

EXT_CLUSTERS_CSV=$(basename "$EXT_CLUSTERS_REMOTE")

if [ ! -f "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}" ] || \
[ ! "$EXT_CLUSTERS_REMOTE" == "$(jq -r .url "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}")" ] || \
[ ! -f "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}" ]; then

rm -rf "${EXT_CLUSTERS_LOCAL}"

mkdir -p "${EXT_CLUSTERS_LOCAL}"
wget "$EXT_CLUSTERS_REMOTE" -O "${EXT_CLUSTERS_LOCAL}/${EXT_CLUSTERS_CSV}"

jq -n \
--arg url "$EXT_CLUSTERS_REMOTE" \
--arg save_time "$(date +"%Y-%m-%d %H:%M:%S %Z")" \
'{"url" : $url, "save_time": $save_time}' > "${EXT_CLUSTERS_LOCAL}/${JSON_FILE}"

fi
11 changes: 11 additions & 0 deletions bin/convert_sam_to_sorted_bam.sh
@@ -0,0 +1,11 @@
# Convert SAM to sorted BAM file
# Remove source SAM file if $LITE is true

samtools view -@ "$(nproc)" -b "$SAM" > "$BAM"

samtools sort -@ "$(nproc)" -o "$SORTED_BAM" "$BAM"
rm "$BAM"

if [ "$LITE" = true ]; then
rm "$(readlink -f "$SAM")"
fi
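
As an aside, the intermediate BAM could be avoided by piping samtools view into samtools sort. A sketch of that variant (not what this commit does; the result is assumed equivalent):

# Sketch: convert and sort in one pipe, skipping the intermediate $BAM
samtools view -@ "$(nproc)" -b "$SAM" | samtools sort -@ "$(nproc)" -o "$SORTED_BAM" -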
10 changes: 5 additions & 5 deletions bin/get_docker_compose.sh → bin/create_docker_compose.sh
@@ -1,14 +1,14 @@
# Generate a Docker compose file that includes all images used in nextflow.config
# Generate a Docker compose file that includes all images used in $NEXTFLOW_CONFIG

COUNT=0

echo "services:" >> $COMPOSE
echo "services:" >> "$COMPOSE"

grep -E "container\s?=" $NEXTFLOW_CONFIG \
grep -E "container\s?=" "$NEXTFLOW_CONFIG" \
| sort -u \
| sed -r "s/\s+container\s?=\s?'(.+)'/\1/" \
| while read -r IMAGE ; do
COUNT=$((COUNT+1))
echo " SERVICE${COUNT}:" >> $COMPOSE
echo " image: $IMAGE" >> $COMPOSE
echo " SERVICE${COUNT}:" >> "$COMPOSE"
echo " image: $IMAGE" >> "$COMPOSE"
done
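
A hypothetical before/after for the loop above (the image name and config line are placeholders, not taken from this commit):

# given a line in $NEXTFLOW_CONFIG such as:
#     container = 'example/bcftools:1.16'
# the loop appends the following entry to $COMPOSE, under the "services:" key written beforehand:
#     SERVICE1:
#       image: example/bcftools:1.16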
9 changes: 0 additions & 9 deletions bin/create_seroba_db.sh

This file was deleted.

95 changes: 95 additions & 0 deletions bin/generate_overall_report.py
@@ -0,0 +1,95 @@
#! /usr/bin/env python3

# Generate the overall report based on the sample reports, the columns specified by COLUMNS_BY_CATEGORY, and the ARIBA metadata

import sys
from itertools import chain
import pandas as pd
import glob


# Specify the columns to be included in the output file and their order (except those based on ARIBA metadata)
COLUMNS_BY_CATEGORY = {
    'IDENTIFICATION': ['Sample_ID'],
    'QC': ['Read_QC', 'Assembly_QC', 'Mapping_QC', 'Taxonomy_QC', 'Overall_QC'],
    'READ': ['Bases'],
    'ASSEMBLY': ['Contigs#', 'Assembly_Length', 'Seq_Depth'],
    'MAPPING': ['Ref_Cov_%', 'Het-SNP#'],
    'TAXONOMY': ['S.Pneumo_%', 'Top_Non-Strep_Genus', 'Top_Non-Strep_Genus_%'],
    'LINEAGE': ['GPSC'],
    'SEROTYPE': ['Serotype'],
    'MLST': ['ST', 'aroE', 'gdh', 'gki', 'recP', 'spi', 'xpt', 'ddl'],
    'PBP': ['pbp1a', 'pbp2b', 'pbp2x', 'AMO_MIC', 'AMO_Res', 'CFT_MIC', 'CFT_Res(Meningital)', 'CFT_Res(Non-meningital)', 'TAX_MIC', 'TAX_Res(Meningital)', 'TAX_Res(Non-meningital)', 'CFX_MIC', 'CFX_Res', 'MER_MIC', 'MER_Res', 'PEN_MIC', 'PEN_Res(Meningital)', 'PEN_Res(Non-meningital)']
}


# Check argv and save to global variables
if len(sys.argv) != 4:
sys.exit('Usage: generate_overall_report.py INPUT_PATTERN ARIBA_METADATA OUTPUT_FILE')
INPUT_PATTERN = sys.argv[1]
ARIBA_METADATA = sys.argv[2]
OUTPUT_FILE = sys.argv[3]


def main():
output_columns = get_output_columns()
df_output = get_df_output(output_columns)

# Saving df_output to OUTPUT_FILE in csv format
df_output.to_csv(OUTPUT_FILE, index=False, na_rep='_')


# Get output columns based on COLUMNS_BY_CATEGORY and ARIBA metadata
def get_output_columns():
output_columns = list(chain.from_iterable(COLUMNS_BY_CATEGORY.values()))
add_ariba_columns(output_columns)
return output_columns


# Based on ARIBA metadata, add additional output columns
def add_ariba_columns(output_columns):
# Get all targets in ARIBA metadata
ariba_targets = set(pd.read_csv(ARIBA_METADATA, sep='\t')['target'].unique())

# Adding special cases if certain targets exist
if 'TET' in ariba_targets:
ariba_targets.add('DOX')
if 'FQ' in ariba_targets:
ariba_targets.add('LFX')
if 'TMP' in ariba_targets and 'SMX' in ariba_targets:
ariba_targets.add('COT')
if 'ERY_CLI' in ariba_targets:
ariba_targets.update(['ERY', 'CLI'])

# Add all targets alphabetically, except always adding PILI at the end
pilis = []
for target in sorted(ariba_targets):
if target.lower().startswith('pili'):
pilis.append(target)
else:
output_columns.extend([f'{target}_Res', f'{target}_Determinant'])
for pili in pilis:
output_columns.extend([f'{pili}', f'{pili}_Determinant'])


# Generating df_output based on all sample reports with columns in the order of output_columns
def get_df_output(output_columns):
# Generate an empty dataframe as df_manifest based on output_columns
df_manifest = pd.DataFrame(columns=output_columns)

# Generate a dataframe for each sample report and then concat df_manifest and all dataframes into df_output
dfs = [df_manifest]
reports = glob.glob(INPUT_PATTERN)
for report in reports:
df = pd.read_csv(report, dtype=str)
dfs.append(df)
df_output = pd.concat(dfs, ignore_index=True).sort_values(by=['Sample_ID'])

# Ensure column order in df_output is the same as output_columns
df_output = df_output[output_columns]

return df_output


if __name__ == "__main__":
main()
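
A hedged usage sketch (file names are placeholders). Note that the script expands INPUT_PATTERN itself with glob.glob, so the pattern should be quoted to stop the shell expanding it first:

# Hypothetical invocation; the quoted glob is expanded by the script, not by the shell
python3 generate_overall_report.py 'reports/*_report.csv' ariba_metadata.tsv overall_report.csv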
5 changes: 5 additions & 0 deletions bin/generate_sample_report.sh
@@ -0,0 +1,5 @@
# Combine all csv reports into a single csv, then add Sample_ID as the first field

paste -d , ${SAMPLE_ID}_process_report_*.csv \
| sed '1 s/^/\"Sample_ID\",/' \
| sed "2 s/^/\"${SAMPLE_ID}\",/" > "$SAMPLE_REPORT"
14 changes: 14 additions & 0 deletions bin/get_assembly_qc.sh
@@ -0,0 +1,14 @@
# Extract assembly QC information from Quast's report.tsv and the total base count, then determine the QC result

CONTIGS=$(awk -F'\t' '$1 == "# contigs (>= 0 bp)" { print $2 }' "$REPORT")
LENGTH=$(awk -F'\t' '$1 == "Total length" { print $2 }' "$REPORT")
DEPTH=$(echo "scale=2; $BASES / $LENGTH" | bc -l)

if [[ $CONTIGS -le $QC_CONTIGS ]] && [[ $LENGTH -ge $QC_LENGTH_LOW ]] && [[ $LENGTH -le $QC_LENGTH_HIGH ]] && [[ "$(echo "$DEPTH >= $QC_DEPTH" | bc -l)" == 1 ]]; then
ASSEMBLY_QC="PASS"
else
ASSEMBLY_QC="FAIL"
fi

echo \"Assembly_QC\",\"Contigs#\",\"Assembly_Length\",\"Seq_Depth\" > "$ASSEMBLY_QC_REPORT"
echo \""$ASSEMBLY_QC"\",\""$CONTIGS"\",\""$LENGTH"\",\""$DEPTH"\" >> "$ASSEMBLY_QC_REPORT"