-
Notifications
You must be signed in to change notification settings - Fork 7
/
intersectlookup.pl
executable file
·288 lines (272 loc) · 9.78 KB
/
intersectlookup.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/perl
$usage .= "$0 - look up GFF intersect tags\n";
$usage .= "\n";
$usage .= "Usage: $0 [-self] [-image] [-unique] [-evalfile expr] [-quiet] [-showtogether] [-maxscore|-minscore] [-first|-maxlen|-minpairdistance|-maxsort expr] [<GFF files with intersect tags>]\n";
$usage .= "\n";
$usage .= "group field in <GFF file> should have intersect tags of the form \"intersect(filename)=(1 5 6 19)\"\n";
$usage .= " ... this will print lines 1, 5, 6 and 19 of <filename>\n";
$usage .= "\n";
$usage .= "Use the -evalfile switch to perform operations (eg substitutions) on <filename> before printing;\n";
$usage .= "one of these expressions must evaluate to 1 or the intersect tag will be discarded,\n";
$usage .= "so -evalfile can also be used for tests.\n";
$usage .= "\n";
$usage .= "Use the -self switch in conjunction with \"gffintersect.pl -self\" (equivalent to \"-quiet -evalfile s/^self&//\")\n";
$usage .= "\n";
$usage .= "Use the -image switch to find the \"image\" of <filename>'s GFFs, assuming this is a GFF-pair\n";
$usage .= "\n";
$usage .= "Use the -quiet switch to suppress the addition of intersect()=() information to the output\n";
$usage .= "\n";
$usage .= "Use the -showtogether switch to show the input GFF lines as well (some lines may appear more than once)\n";
$usage .= "\n";
$usage .= "Use the -maxscore switch to only print the highest-scoring hit for a line,\n";
$usage .= "Use -minscore to only print the lowest-scoring hit for a line,\n";
$usage .= "Use -first to only print the max-starting-position hit for a line,\n";
$usage .= "Use -maxlen to only print the longest hit for a line,\n";
$usage .= "Use -minpairdistance to only print the closest-pair hit for a line,\n";
$usage .= "Use the more generic -maxsort to specify a sorting criterion, with \@a and \@b holding the GFF fields\n";
$usage .= " e.g. -maxscore is equivalent to \"-maxsort '\$b[5]<=>\$a[5]'\"\n";
$usage .= "Use -unique to entirely reject intersect sets larger than one\n";
$usage .= "\n";
$usage .= "EXAMPLE: How to prune a GFF file of the lowest-scoring redundant entries:\n";
$usage .= " gffintersect.pl -self <filename> | intersectlookup.pl -self -maxscore\n";
$usage .= "\n";
while (@ARGV) {
last unless $ARGV[0] =~ /^-/;
$opt = lc shift;
if ($opt eq "-evalfile") { defined($evalfile = shift) or die $usage; push @evalfile, $evalfile }
elsif ($opt eq "-self") { push @evalfile, "s/^self&//"; $quiet = 1; $self = 1 }
elsif ($opt eq "-image") { $image = 1 }
elsif ($opt eq "-unique") { $unique = 1 }
elsif ($opt eq "-first") { $sort = "bystartpos" }
elsif ($opt eq "-maxscore") { $sort = "bymaxscore" }
elsif ($opt eq "-minscore") { $sort = "byminscore" }
elsif ($opt eq "-maxlen") { $sort = "bylen" }
elsif ($opt eq "-mergeset") { $mergeSet = 1 }
elsif ($opt eq "-minpairdistance") { $sort = "bypairdistance" }
elsif ($opt eq "-maxsort") { defined($sortexpr = shift) or die $usage; $sort = "byexpr" }
elsif ($opt eq "-quiet") { $quiet = 1 }
elsif ($opt eq "-showtogether") { $showtogether = 1 }
elsif ($opt eq "-h") {die "$usage\n"}
else { die "$usage\nUnknown option: $opt\n" }
}
@ARGV>=1 or @ARGV = ("-");
foreach $file1 (@ARGV) {
open file1, $file1 or die "$0: couldn't open $file1: $!";
$line1 = 0;
while (<file1>) {
++$line1;
$lineText{$line1} = $_;
@f = split /\t/;
my @pair = split(/\s+/,$f[8]);
if ($f[6] eq "-") { @pair[1,2] = @pair[2,1] }
$set = undef;
while ((($file2,$lines2) = ($f[8] =~ /intersect\(([^\)]+)\)=\(([^\)]+)\)/)) && $file2 ne "") {
$f8sub = quotemeta "intersect($file2)=($lines2)"; # Backslash all non-alphanumeric characters
$f[8] =~ s/$f8sub//;
if (@evalfile) {
$_ = $file2;
$print = 0;
foreach $evalfile (@evalfile) {
#warn "Evaluating $evalfile on $_\n";
if (eval $evalfile) { $print = 1 }
}
next unless $print;
$file2 = $_;
}
next unless -r $file2;
#$tmpf8 = $f[8]; chomp $tmpf8; warn "file1=$file1 line1=$line1 file2=$file2 lines2=($lines2) \$f[8]='$tmpf8'\n";
@line2 = split /\s+/, $lines2;
foreach $line2 (@line2) {
# If the unique flag is specified and intersect set is larger than 1, set the tainted flag.
if ($unique && @line2 > 1) { $tainted{$file2}->{$line2} = 1 }
else {
# Keep track of the relation from each member of the intersect set to line1
push @{$backlookup{$file2}->{$line2}->{$file1}}, $line1;
# If flag is set, show input GFF lines too
if ($showtogether) { $keep{$file1}->{$line1} = $_ }
if ($image) { $transform{$file1}->{$line1} = [@f[0,3,4],@pair[0..2]] }
}
}
if (defined $sort || $mergeSet) {
# Initially, %set, $set, $sets,and $thisset are all undefined.
# $thisset = contains the intersection set number for line2
# %set = hash that relates each line2 to an intersection set
# $sets = stores the number of intersection sets found
foreach $line2 (@line2) {
$thisset = $set{$file2}->{$line2};
$file2line2 = "$file2\t$line2";
# Note that the $set scalar will always be undefined
# on the initial pass thru this loop.
if (defined $set) {
if (defined $thisset) {
if ($thisset != $set) {
foreach (@{$members[$thisset]}) {
($thisfile2,$thisline2) = split /\t/;
$set{$thisfile2}->{$thisline2} = $set;
}
undef $members[$thisset];
}
} else {
$set{$file2}->{$line2} = $set;
undef $gfftext[$set]->{$file2line2};
}
} else {
if (defined $thisset) {
$set = $thisset;
} else {
$set = $sets++;
$set{$file2}->{$line2} = $set;
$gfftext[$set]->{$file2line2} = undef; # mark a place in this hash
}
}
}
}
} # while ($file2, $line2)
} # while (<file1>)
close file1;
}
# Merge intersect set lines into one line per set.
if ($mergeSet && $self)
{
@file2list = keys %set; #print "file2list=[@file2list]\n";
$file2 = $file2list[0];
@line2list = keys %{$set{$file2}}; #print "line2list=[@line2list]\n";
@set2list = values %{$set{$file2}}; #print "set2list=[@set2list]\n";
$pSetNum = undef;
foreach $setNum (sort bynum @set2list)
{
if (defined($pSetNum) && $setNum == $pSetNum) { next;}
$minStart = undef; $maxEnd = undef;
foreach $lineNum (@line2list)
{
if ($set{$file2}->{$lineNum} == $setNum)
{
$line = $lineText{$lineNum};
@fields = split /\t/, $line;
$start = $fields[3];
$end = $fields[4];
$name = $fields[0];
$strand = $fields[6];
if (defined($minStart))
{
if ($start < $minStart) {$minStart = $start;}
}
else {$minStart = $start;}
if (defined($maxEnd))
{
if ($end > $maxEnd) {$maxEnd = $end;}
}
else {$maxEnd = $end;}
}
}
print "$name\t.\t.\t$minStart\t$maxEnd\t.\t$strand\t.\t$setNum\n";
$pSetNum = $setNum;
}
exit;
}
while (($file2,$line2ref) = each %backlookup) {
unless (open file2, $file2) {
warn "$0: couldn't open $file2: $!\n";
next;
}
@line2 = sort {$a<=>$b} keys %$line2ref; # sort in ascending order
$line2 = 0;
while (@line2 && defined($_ = <file2>)) {
if (++$line2 == $line2[0] && !$tainted{$file2}->{shift @line2}) {
if (defined $sort) {
$set = $set{$file2}->{$line2};
@f = split /\t/;
$gfftext[$set]->{"$file2\t$line2"} = $_;
if (++$count[$set] == keys %{$gfftext[$set]}) {
%member = ();
while (($member,$gfftext) = each %{$gfftext[$set]}) { $member{$gfftext} = $member }
#warn "Set $set:\n";
#while (($member,$gfftext) = each %{$gfftext[$set]}) { warn " member=\"$member\" gfftext=$gfftext" }
$maxgfftext = (sort $sort keys %member)[0];
#warn " maxmember=\"$member{$maxgfftext}\"\n";
($maxfile2,$maxline2) = split /\t/, $member{$maxgfftext};
show($maxfile2,$maxline2,$maxgfftext);
undef $gfftext[$set];
}
}
else {
show($file2,$line2,$_);
}
}
}
close file2;
}
sub show {
my ($file2,$line2,$gfftext) = @_;
my $fileref = $backlookup{$file2}->{$line2};
if ($showtogether) {
while (($file1,$lineref1) = each %$fileref) {
foreach $line1 (@$lineref1) { print $keep{$file1}->{$line1} }
}
}
my @f = split /\t/, $gfftext;
chomp $f[8];
if ($image) {
while (($file1,$lineref1) = each %$fileref) {
my $line1;
foreach $line1 (@$lineref1) {
my @transform = @{$transform{$file1}->{$line1}};
my $dir = $transform[4] > $transform[5] ? -1 : +1;
# print "file1=$file1 line1=$line1 transform=(@transform) f=(@f)\n";
my $start = ($f[3] - $transform[1]) * $dir + $transform[4];
my $end = ($f[4] - $transform[1]) * $dir + $transform[4];
my $strand = $f[6];
if ($start > $end) {
($start,$end) = ($end,$start);
if ($strand eq "+") { $strand = "-" }
elsif ($strand eq "-") { $strand = "+" }
}
print join("\t",$transform[3],$f[1],"$f[2]-homology",$start,$end,$f[5],$strand,@f[7,8]);
unless ($quiet) { print " original($file2)=$line2" }
print "\n";
}
}
} elsif ($quiet) {
print $gfftext;
} else {
while (($file1,$lineref1) = each %$fileref) {
$f[8] .= ' ' unless $f[8] =~ / $/;
$f[8] .= "intersect($file1)=(@$lineref1)";
}
print join("\t",@f)."\n";
}
if ($showtogether) { print "\n" }
}
sub bystartpos {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
$b[3] <=> $a[3];
}
sub bymaxscore {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
$b[5] <=> $a[5]; # descending order
}
sub byminscore {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
$a[5] <=> $b[5]; # ascending order
}
sub bylen {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
$b[4]-$b[3] <=> $a[4]-$a[3];
}
sub bypairdistance {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
my ($apair) = ($a[8] =~ /(\d+)/);
my ($bpair) = ($b[8] =~ /(\d+)/);
abs($apair-$a[3]) <=> abs($bpair-$b[3]);
}
sub byexpr {
my @a = split /\t/, $a;
my @b = split /\t/, $b;
eval $sortexpr;
}
sub bynum { $a <=> $b }