-
Notifications
You must be signed in to change notification settings - Fork 3
/
TargetSeqQC.pl
238 lines (220 loc) · 8.53 KB
/
TargetSeqQC.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use List::Util qw(sum);
use File::Basename;
#################################
# This takes a bam file as input and generates the coverage statistics on it.
#
#################################
# Command-line state shared with the rest of the script.
#   $BED      path given with -bed (required)
#   $BAM      path given with -bam (required)
#   $OUTFILE  path given with -out (required)
#   @Interval depth thresholds given with one or more -int (required)
#   $SUM      '1' requests the aggregate row for the whole BED file
#   $ALL      '1' requests one row per BED region (the default)
my $BED = '';
my $BAM = '';
my $OUTFILE = '';
my @Interval;
my $SUM = '0';
my $ALL = '1';
my $help;
GetOptions(
    'bed=s'  => \$BED,
    'bam=s'  => \$BAM,
    'all=s'  => \$ALL,
    'sum=s'  => \$SUM,
    'int=s'  => \@Interval,
    'out=s'  => \$OUTFILE,
    'help|h' => \$help,
) or pod2usage();
pod2usage() if $help;

# Each required option gets its own message with an example value. The text
# goes through die so it reaches STDERR and the exit status is non-zero.
die "bed file is required -bed /data/khanlab/ref/GATK/hg19/RMS_Amplicon.bedtools.bed\n"
    unless $BED;
die "bam file is required -bam /data/khanlab/projects/working_DATA/Sample_RMS222_A_H3FHYAFXX/Sample_RMS222_A_H3FHYAFXX.trim.bam\n"
    unless $BAM;
die "output file is required -out /data/khanlab/projects/working_DATA/Sample_RMS222_A_H3FHYAFXX/Sample_RMS222_A_H3FHYAFXX.stats.txt\n"
    unless $OUTFILE;
die "You have to use argument -int atleast once.\n"
    unless @Interval;

# The thresholds are later compared numerically against per-base depths, so
# reject anything that is not a plain non-negative number up front instead
# of letting a typo be treated as 0 further down.
for my $threshold (@Interval) {
    die "Argument -int expects a non-negative number, got '$threshold'\n"
        unless $threshold =~ /\A\+?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\z/;
}
########################
# Main
########################
# Main flow:
#   1. write the header line to the output file,
#   2. with -sum 1, write one aggregate row over the merged BED regions,
#   3. with -all 1, write one row per BED line.
# External tools used through the shell: cut, sortBed, mergeBed, coverageBed
# and grep -P. Intermediate coverage tables are written under /scratch.

my $tmp       = basename($BAM, ".bam");            # sample label and temp-file stem
my $colsinBED = _count_extra_bed_columns($BED);    # dies if the BED file is unreadable

open(my $out_fh, '>', $OUTFILE)
    or die "Can not open the output file $OUTFILE: $!\nPlease check permissions\n";

# Paths are single-quoted before they are placed in shell pipelines so that
# spaces or shell metacharacters in a file name cannot alter the command.
my $bed_arg = _shell_quote($BED);
my $bam_arg = _shell_quote($BAM);

print {$out_fh} join("\t",
    'Chr', 'Start', 'Stop',
    (map { "Head$_" } 1 .. $colsinBED),
    'Bamfile', 'Length', '%Region Covered',
    'Min', 'Max', 'Mean', 'Median', 'Q1', 'Q3',
    (map { "% min X$_ Coverage" } @Interval),
), "\n";

###################################
# Aggregate Stats on whole bed file
###################################
if ($SUM eq 1) {
    my $cov_all   = "/scratch/$tmp.covinfo.all";
    my $merge_cmd = "cut -f 1-3 $bed_arg | sortBed -i - | mergeBed -i -";
    _run_shell("$merge_cmd | coverageBed -d -abam $bam_arg -b - > " . _shell_quote($cov_all));

    open(my $cov_fh, '<', $cov_all)
        or die "Can not open the coverage file $cov_all: $!\n";
    my $depths = _positive_depths($cov_fh);
    close($cov_fh);

    # Total length of the merged regions (sum of stop - start).
    open(my $merged_fh, '-|', $merge_cmd)
        or die "Can not run: $merge_cmd: $!\n";
    my $size = 0;
    while (my $rec = <$merged_fh>) {
        chomp $rec;
        my (undef, $start, $stop) = split /\t/, $rec;
        next unless defined $stop;
        $size += $stop - $start;
    }
    close($merged_fh)
        or die "Command failed (status $?): $merge_cmd\n";

    my $label = join("\t", 'Entire Bed Region', '-', '-',
                     map { "record$_" } 1 .. $colsinBED);
    _print_stats_row($out_fh, $label, $tmp, $size, $depths, \@Interval);
}

###################################
# Stats on every region in bed file
###################################
if ($ALL eq 1) {
    my $cov_file = "/scratch/$tmp.covinfo";
    _run_shell("cut -f 1-3 $bed_arg | coverageBed -d -abam $bam_arg -b - > " . _shell_quote($cov_file));

    open(my $bed_fh, '<', $BED)
        or die "Can not open the Bed file $BED: $!\n";
    while (my $region = <$bed_fh>) {
        chomp $region;
        my ($chr, $start, $stop) = split /\t/, $region;
        unless (defined $stop) {
            # Blank lines are skipped silently; anything else is reported.
            warn "Skipping BED line without chr/start/stop fields: $region\n"
                if $region =~ /\S/;
            next;
        }
        my $depths = _region_depths($cov_file, $chr, $start, $stop);
        _print_stats_row($out_fh, $region, $tmp, $stop - $start, $depths, \@Interval);
    }
    close($bed_fh);
}

# Closed regardless of which modes ran; buffered write errors surface here.
close($out_fh)
    or die "Can not finish writing the output file $OUTFILE: $!\n";

###################################
# Helpers used by Main
###################################

# Returns the number of tab-separated fields on the first BED line minus the
# three coordinate columns (0 for an empty file). Dies if the file can not
# be opened.
sub _count_extra_bed_columns {
    my ($bed_path) = @_;
    open(my $fh, '<', $bed_path)
        or die "Can not open the Bed file $bed_path: $!\n";
    my $first = <$fh>;
    close($fh);
    return 0 unless defined $first;
    chomp $first;
    my @fields = split /\t/, $first, -1;    # -1 keeps trailing empty fields
    return scalar(@fields) - 3;
}

# Wraps a string in single quotes for /bin/sh, escaping embedded quotes.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Runs a shell command line and dies if it does not exit with status 0, so a
# failed pipeline is not silently reported as zero coverage.
sub _run_shell {
    my ($cmd) = @_;
    system($cmd) == 0
        or die "Command failed (status $?): $cmd\n";
    return;
}

# Reads tab-separated lines from $fh and returns an array ref holding the
# fifth column of every line where that value is greater than zero. The
# script treats that column as the per-base depth.
sub _positive_depths {
    my ($fh) = @_;
    my @depths;
    while (my $rec = <$fh>) {
        chomp $rec;
        my $depth = (split /\t/, $rec)[4];
        push @depths, $depth
            if defined $depth && length $depth && $depth > 0;
    }
    return \@depths;
}

# Returns the positive depths recorded in $cov_file for exactly one region.
# The pattern is anchored at the line start and ends with a tab so that
# stop=200 can not match stop=2000, and the fields are regex-quoted so that
# contig names containing '.', '*' and similar are matched literally. grep
# is started without a shell.
sub _region_depths {
    my ($cov_file, $chr, $start, $stop) = @_;
    my $pattern = '^'
        . join('\t', map { quotemeta($_) } ($chr, $start, $stop))
        . '\t';
    open(my $grep_fh, '-|', 'grep', '-P', '--', $pattern, $cov_file)
        or die "Can not run grep on $cov_file: $!\n";
    my $depths = _positive_depths($grep_fh);
    close($grep_fh);
    # grep exits with 1 when nothing matched; only larger values are errors.
    die "grep failed on $cov_file (status $?)\n" if ($? >> 8) > 1;
    return $depths;
}

# Writes one statistics row to $out.
#   $label      text for the leading columns (BED line or aggregate label)
#   $sample     value for the Bamfile column
#   $size       number of bases in the region
#   $depths     array ref of the depths that are greater than zero
#   $intervals  array ref of depth thresholds for the trailing columns
# Uncovered bases are added back as zeros before the statistics are taken.
sub _print_stats_row {
    my ($out, $label, $sample, $size, $depths, $intervals) = @_;
    my @data    = @$depths;
    my $covered = scalar @data;

    # Nothing covered (this also handles zero-length regions): all zeros.
    if ($covered == 0) {
        print {$out} join("\t", $label, $sample, $size,
                          (0) x 7, (0) x scalar(@$intervals)), "\n";
        return;
    }

    my $pct = '100';
    if ($covered != $size) {
        $pct = $size > 0 ? sprintf("%.2f", ($covered / $size) * 100) : '0.00';
        # Pad with exactly one zero per uncovered base.
        push @data, (0) x ($size - $covered) if $size > $covered;
    }

    @data = sort { $a <=> $b } @data;
    my $mean   = mean(@data);
    my $median = Median(\@data);
    my $q1     = $data[ int($#data / 4) ];
    my $q3     = $data[ int(3 * $#data / 4) ];

    print {$out} join("\t",
        $label, $sample, $size, $pct,
        $data[0], $data[-1], $mean, $median, $q1, $q3,
        (map { Coverage($_, @data) } @$intervals),
    ), "\n";
    return;
}
#unlink("/scratch/$tmp.covinfo.all");
#unlink("/scratch/$tmp.covinfo");
#######################
# END of MAIN
#######################
# Coverage($cov, @arr)
# Returns the percentage of values in @arr that are greater than or equal to
# $cov, formatted with two decimals (for example "75.00").
# An empty @arr yields "0.00" instead of dividing by zero.
sub Coverage{
    my ($cov, @arr) = @_;
    return sprintf("%.2f", 0) unless @arr;
    my $hits = grep { $_ >= $cov } @arr;    # grep in scalar context counts matches
    return sprintf "%.2f", ($hits / scalar(@arr)) * 100;
}
# mean(@values)
# Arithmetic mean of the arguments, rounded to a whole number with
# sprintf "%.0f". Returns "0" when called without arguments.
sub mean{
    my @values  = @_;
    my $average = 0;
    if (@values) {
        $average = sum(@values) / scalar(@values);
    }
    return sprintf("%.0f", $average);
}
# Median($refdata)
# Takes a reference to an array of numbers and returns the median rounded to
# a whole number with sprintf "%.0f". For an even number of elements the two
# middle values are averaged before rounding.
# Side effect kept from the original: the referenced array is sorted
# numerically in place.
# An empty (or missing) array returns "0" without touching undefined
# elements.
sub Median{
    my ($refdata) = @_;
    return sprintf("%.0f", 0) unless $refdata && @$refdata;

    @$refdata = sort { $a <=> $b } @$refdata;
    my $count = scalar @$refdata;
    my $mid   = int($count / 2);
    my $median = $count % 2
        ? $refdata->[$mid]
        : ($refdata->[$mid - 1] + $refdata->[$mid]) / 2;
    return sprintf "%.0f", $median;
}
=head1 SYNOPSIS
$0 -bed /data/khanlab/ref/GATK/hg19/RMS_Amplicon.bedtools.bed -bam /data/khanlab/projects/working_DATA/Sample_RMS222_A_H3FHYAFXX/Sample_RMS222_A_H3FHYAFXX.trim.bam -out Sample_RMS222_A_H3FHYAFXX.QC.txt -int 10 -int 100 -int 500 -int 1000 -sum 1
Usage:
-h, -help, --help Print this message.
-bed Bed file containing the locations where statistics are to be generated. (Required)
-bam Bam file on which you would like to generate the statistics; it should be indexed. (Required)
-all 0 if you don't want the statistics to be generated for every region in the bed file.
-all 0 -sum 1 will generate aggregate stat only. (Default generate stat on all positions)
-sum 1 if you want aggregate entry for the whole bed file. (Default does not aggregate.)
Overlapping regions in bed file will be merged to avoid overcounting.
-int % of bases covered with at least intX coverage. Can be specified multiple times. (Required)
-out Output file name. (Required)
For questions or comments, please contact: Rajesh Patidar <rajbtpatidar@gmail.com>
=cut