forked from wwood/CoverM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate.R
22 lines (17 loc) · 1.52 KB
/
validate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
library(data.table)
contigs = fread('cargo run --release -- contig -b v80.public.B06.V06.A.pos.lim.R1stq_filtered.bam',header=T)
genomes = fread('cargo run --release -- genome -s . -b v80.public.B06.V06.A.pos.lim.R1stq_filtered.bam --min-covered-fraction 0 -m mean',header=T)
genomes_coverage_hist = fread('cargo run --release -- genome -s . -b v80.public.B06.V06.A.pos.lim.R1stq_filtered.bam --min-covered-fraction 0 -m coverage_histogram', header=T)
setnames(contigs, c('sample','contig','coverage'))
setnames(genomes, c('sample','genome','mean_coverage'))
setnames(genomes_coverage_hist, c('sample','genome','coverage','count'))
stopifnot(
nrow(merge(genomes_coverage_hist[, .(calculated_mean_coverage=sum(coverage*count)/sum(count)), by='genome'], genomes[mean_coverage > 0], all=T, by='genome')[abs(calculated_mean_coverage - mean_coverage)>0.0001]) == 0)
## Are the correct lengths of the genomes found?
lengths = fread('samtools view -H v80.public.B06.V06.A.pos.lim.R1stq_filtered.bam |grep -v \'^@PG\'',header=F)
l2 = lengths[2:nrow(lengths), .(contig = gsub('SN:','',V2), contig_length=as.numeric(gsub('LN:','',V3)))]
l2[, genome := gsub('\\..*','',contig)]
l2[, .(genome_length=sum(contig_length)), by=genome]
stopifnot(nrow(merge(l2[, .(genome_length=sum(contig_length)), by=genome], genomes_coverage_hist[,.(coverage_hist_len=sum(count)), by=genome], by='genome',all=T)[!is.na(coverage_hist_len)][genome_length != coverage_hist_len]) == 0)
## Are there the correct number of contigs in the contig output?
stopifnot(nrow(contigs) == nrow(l2))