Skip to content

Commit

Permalink
better handle empty files and invalid formatted files
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Jul 22, 2019
1 parent f3cecc7 commit 39513f2
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 46 deletions.
114 changes: 68 additions & 46 deletions assembly-scan.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! /usr/bin/env python3
"""Produce basic assembly stats for a given assembly."""
VERSION = 0.2
VERSION = "0.3.0"

def open_fasta(filename):
"""Return filehandle depending on file extension."""
Expand All @@ -10,21 +10,39 @@ def open_fasta(filename):
return open(filename, 'r')


def default_stats():
"""Return an default dict for assembly stats."""
keys = [
"contig_percent_a", "contig_percent_c", "contig_percent_g",
"contig_percent_t", "contig_percent_n", "contig_non_acgtn",
"contigs_greater_100k", "contigs_greater_10k", "contigs_greater_1k",
"contigs_greater_1m", "l50_contig_count", "max_contig_length",
"mean_contig_length", "median_contig_length", "min_contig_length",
"n50_contig_length", "num_contig_non_acgtn", "percent_contigs_greater_100k",
"percent_contigs_greater_10k", "percent_contigs_greater_1k",
"percent_contigs_greater_1m", "total_contig", "total_contig_length"
]
return dict(zip(keys, [0] * len(keys)))


def read_fasta(input_fasta):
"""Return a list of seqeunces from a given FASTA file."""
try:
seq = []
records = []
is_fasta = False
with open_fasta(input_fasta) as fasta_fh:
for line in fasta_fh:
line = line.rstrip()
if line.startswith('>'):
is_fasta = True
if seq:
records.append(''.join(seq))
seq = []
else:
elif is_fasta:
seq.append(line)
records.append(''.join(seq))
if is_fasta:
records.append(''.join(seq))
return records
except IOError as error:
raise RuntimeError("Error opening assembly.") from error
Expand Down Expand Up @@ -109,47 +127,51 @@ def print_percent(val):
args = parser.parse_args()

# Read FASTA
fasta = read_fasta(args.assembly)

# Generate Stats
lengths = sorted([len(seq) for seq in fasta], key=int, reverse=True)
length_totals = contig_lengths(lengths)
total_contig = len(fasta)
total_bp = sum(lengths)
usage = nucleotide_usage(fasta, total_bp)
n50 = calculate_n50(lengths, total_bp)

stats = {
"contig_percent_a": print_percent(usage['a']),
"contig_percent_c": print_percent(usage['c']),
"contig_percent_g": print_percent(usage['g']),
"contig_percent_t": print_percent(usage['t']),
"contig_percent_n": print_percent(usage['n']),
"contig_non_acgtn": print_percent(usage['non_acgtn']),
"contigs_greater_100k": length_totals['100k'],
"contigs_greater_10k": length_totals['10k'],
"contigs_greater_1k": length_totals['1k'],
"contigs_greater_1m": length_totals['1m'],
"l50_contig_count": n50['l50'],
"max_contig_length": max(lengths),
"mean_contig_length": int(mean(lengths)),
"median_contig_length": int(median(lengths)),
"min_contig_length": min(lengths),
"n50_contig_length": n50['n50'],
"num_contig_non_acgtn": usage['total_non_acgtn'],
"percent_contigs_greater_100k": print_percent(
length_totals['100k'] / total_contig
),
"percent_contigs_greater_10k": print_percent(
length_totals['10k'] / total_contig
),
"percent_contigs_greater_1k": print_percent(
length_totals['1k'] / total_contig
),
"percent_contigs_greater_1m": print_percent(
length_totals['1m'] / total_contig
),
"total_contig": total_contig,
"total_contig_length": total_bp
}
fasta = list(filter(None, read_fasta(args.assembly)))
stats = default_stats()

if fasta:
# Generate Stats
lengths = sorted([len(seq) for seq in fasta], key=int, reverse=True)
length_totals = contig_lengths(lengths)
total_contig = len(fasta)
total_bp = sum(lengths)
usage = nucleotide_usage(fasta, total_bp)
n50 = calculate_n50(lengths, total_bp)

stats = {
"contig_percent_a": print_percent(usage['a']),
"contig_percent_c": print_percent(usage['c']),
"contig_percent_g": print_percent(usage['g']),
"contig_percent_t": print_percent(usage['t']),
"contig_percent_n": print_percent(usage['n']),
"contig_non_acgtn": print_percent(usage['non_acgtn']),
"contigs_greater_100k": length_totals['100k'],
"contigs_greater_10k": length_totals['10k'],
"contigs_greater_1k": length_totals['1k'],
"contigs_greater_1m": length_totals['1m'],
"l50_contig_count": n50['l50'],
"max_contig_length": max(lengths),
"mean_contig_length": int(mean(lengths)),
"median_contig_length": int(median(lengths)),
"min_contig_length": min(lengths),
"n50_contig_length": n50['n50'],
"num_contig_non_acgtn": usage['total_non_acgtn'],
"percent_contigs_greater_100k": print_percent(
length_totals['100k'] / total_contig
),
"percent_contigs_greater_10k": print_percent(
length_totals['10k'] / total_contig
),
"percent_contigs_greater_1k": print_percent(
length_totals['1k'] / total_contig
),
"percent_contigs_greater_1m": print_percent(
length_totals['1m'] / total_contig
),
"total_contig": total_contig,
"total_contig_length": total_bp
}
else:
stats["comment"] = "Invalid format, or empty FASTA. Please verify."
print(json.dumps(stats, indent=4, sort_keys=True))
2 changes: 2 additions & 0 deletions test/assembly.fna
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>example-header
ATGCATCGTACGTCGTACGTACGTACGTCGATGCTCGTAGCTTCGATCGCTGATCGACTGT
Empty file added test/empty-file.txt
Empty file.
1 change: 1 addition & 0 deletions test/invalid-fasta.fna
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
>some-header
10 changes: 10 additions & 0 deletions test/invalid-format.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/assembly-scan/README.html)
[![Docker Repository on Quay.io](https://quay.io/repository/biocontainers/assembly-scan/status "Docker Repository on Quay.io")](https://quay.io/repository/biocontainers/assembly-scan)
[![Anaconda-Server Badge](https://anaconda.org/bioconda/assembly-scan/badges/downloads.svg)](https://anaconda.org/bioconda/assembly-scan)


*assembly-scan* reads an assembly in FASTA format and outputs summary statistics in JSON format.

# assembly-scan
I wanted a quick method to output simple summary statistics of an input assembly in JSON format. There are alternatives including [assemblathon-stats.pl](https://github.com/ucdavis-bioinformatics/assemblathon2-analysis) and [assembly-stats](https://github.com/sanger-pathogens/assembly-stats), but they didn't ouput JSON which I wanted. There are examples below on which stats are output below.

5 changes: 5 additions & 0 deletions test/wierd-fasta.fna
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>1
ATGCATGACTGCATGACTGACTGACTGACTGTCGTACGTG
>2
>3
ASTACGTAGTGCATGCTGSTAGSTGASTASTAGCTGATGT

0 comments on commit 39513f2

Please sign in to comment.