-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
111 lines (100 loc) · 4.19 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import sys
from enum import Enum, auto
# Enum for the different cases of overlap
class Overlap(Enum):
FULL = auto()
PARTIAL = auto()
ZERO = auto()
class Clustering:
def __init__(self, ov_min=10, ov_min_r=0.2):
self._cluster_x1_min = []
self._cluster_x1_max = []
self._cluster_x2_min = []
self._cluster_x2_max = []
self._nhits_in_cluster = []
self._cluster_labels = []
# If edges are similar within max of those two, match will be included in cluster:
self._overlap_min = ov_min # number of residues
self._overlap_min_ratio = ov_min_r # ratio
def _clean(self):
self._cluster_x1_min = []
self._cluster_x1_max = []
self._cluster_x2_min = []
self._cluster_x2_max = []
self._nhits_in_cluster = []
self._cluster_labels = []
# Given lists of hit beginings and ends, fill up the cluster lists data members,
# as well as the list of cluster labels that each hit belongs to.
def fill_clusters(self, x1l, x2l):
self._clean()
for x1, x2 in zip(x1l, x2l):
icl = self._cluster_id(x1,x2)
self._cluster_labels.append(icl)
# List of cluster labels for each hit
@property
def clabels(self):
return self._cluster_labels
# Lists of cluster limits
@property
def cluster_x1(self):
return self._cluster_x1_min
@property
def cluster_x2(self):
return self._cluster_x2_max
# List of number of hits in each cluster
@property
def nhits(self):
return self._nhits_in_cluster
# Number of clusters
@property
def ncl(self):
return len(self._nhits_in_cluster)
# Check if hit with limits [x1,x2] overlaps with any of the existing clusters.
# If yes, re-calculate cluster limits, increment its hit count, and return the cluster id.
# If no, add new cluster and return the new cluster id.
def _cluster_id(self, x1, x2):
for i in range(self.ncl):
overlaps = self._overlap(x1, x2, i)
if overlaps==Overlap.FULL:
self._cluster_x1_min[i] = min(self._cluster_x1_min[i],x1)
self._cluster_x1_max[i] = max(self._cluster_x1_max[i],x1)
self._cluster_x2_min[i] = min(self._cluster_x2_min[i],x2)
self._cluster_x2_max[i] = max(self._cluster_x2_max[i],x2)
self._nhits_in_cluster[i] += 1
return i
self._cluster_x1_min.append(x1)
self._cluster_x1_max.append(x1)
self._cluster_x2_min.append(x2)
self._cluster_x2_max.append(x2)
self._nhits_in_cluster.append(1)
return self.ncl-1
# Check if hit with limits [x1,x2] belongs to the index-th cluster:
# The cluster low and high limits are actually ranges,
# (x1cl_min, x1cl_max) for low, (x2cl_min, x2cl_max) for high.
# The hit belongs to the cluster if BOTH its ends overlap the respective cluster
# ends within the required percentage tolerance:
# x1 within (x1cl_min-overlap_min, x1cl_max+overlap_min) AND
# x2 within (x2cl_min-overlap_min, x2cl_max+overlap_min)
# Note that overlap_min is calculated differently for "inside" and "outside"
# the cluster (min and max cluster length).
def _overlap(self, x1, x2, index):
# Min and max cluster edges limits
x1cl_min = self._cluster_x1_min[index]
x1cl_max = self._cluster_x1_max[index]
x2cl_min = self._cluster_x2_min[index]
x2cl_max = self._cluster_x2_max[index]
# Min and max cluster length
dx_min = float(x2cl_min-x1cl_max)
dx_max = float(x2cl_max-x1cl_min)
# Min and max cluster edges overlap limits
lo_min = x1cl_min-max(self._overlap_min_ratio*dx_max,self._overlap_min)
lo_max = x1cl_max+max(self._overlap_min_ratio*dx_min,self._overlap_min)
hi_min = x2cl_min-max(self._overlap_min_ratio*dx_min,self._overlap_min)
hi_max = x2cl_max+max(self._overlap_min_ratio*dx_max,self._overlap_min)
# Cluster edges overlap flags
ov_lo = x1>lo_min and x1<lo_max
ov_hi = x2>hi_min and x2<hi_max
if ov_lo and ov_hi:
return Overlap.FULL
else:
return Overlap.ZERO