-
Notifications
You must be signed in to change notification settings - Fork 0
/
BasicClasses.py
executable file
·192 lines (164 loc) · 6.86 KB
/
BasicClasses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
from random import randint
import logging
class Table(object):
    """Table file writer for complex indel simulation.

    Constructing an instance loads the reference sequence from a FASTA file
    and (re)creates the table file. The table is filled with randomly
    generated complex indels when requested, and left empty otherwise.
    """

    # Class-level default; __init__ replaces it with the loaded sequence.
    __reference = ''

    # Inclusive randint bounds for the gap added to the running position
    # before each complex indel, and for every deletion/insertion length.
    _POS_STEP_RANGE = (600, 900)
    _LENGTH_RANGE = (200, 800)

    def __init__(self, fapath, tablepath, isvariant, num):
        """Load the reference and write the table file.

        :param fapath: A string represents the path of fasta file.
        :param tablepath: A string represents the path of table file.
        :param isvariant: A boolean suggests whether to generate variants.
        :param num: An integer count of variants. Needless if you don't generate variants.
        """
        with open(fapath, 'r') as fasta:
            fasta.readline()  # Drop the first line (the FASTA header).
            # Remove every whitespace character so the reference is a single
            # contiguous string; join avoids quadratic re-concatenation.
            self.__reference = ''.join(fasta.read().split())

        # The table file is always (re)created; it stays empty when it is not
        # used for variants generation.
        with open(tablepath, 'w') as table:
            table.write(self.__genVariants(num) if isvariant else '')

    @staticmethod
    def _format_row(index, pos, length, genotype, sequence):
        """Build one tab-separated table line, terminated by a newline.

        The two constant columns ('1' and '0') are emitted verbatim.
        NOTE(review): their meaning is not visible in this file.
        """
        return '\t'.join(
            (str(index), str(pos), str(length), genotype, '1', '0', sequence)
        ) + '\n'

    def __genVariants(self, num):
        """Generate variants information.

        Each pass of the outer loop emits four lines: a homozygous complex
        indel (genotype column '3') followed by a heterozygous one (genotype
        column '1'). Every complex indel takes two lines sharing one position:
        a deletion (negative length, deleted reference bases) and an insertion
        (positive length, random bases). The loop runs ceil(num / 2) times, so
        2 * ceil(num / 2) complex indels are produced in total.

        :param num: An integer indicating number of variants.
        :return: A string contains variants information.
        """
        # Original author's note: total length of reference is 1000,000 and
        # the step between complex indels is meant to be 100000/2/n.
        # TODO: Automatically set step.
        pos = 0
        rows = []
        try:
            for i in range(1, 2 * num + 1, 4):
                for offset, genotype in ((0, '3'), (2, '1')):
                    pos += randint(*self._POS_STEP_RANGE)
                    # Deletion line: the removed bases are copied from the
                    # reference. Slicing past the end of the reference
                    # silently yields a shorter (possibly empty) sequence.
                    length = randint(*self._LENGTH_RANGE)
                    deleted = self.__reference[pos - 1:pos - 1 + length]
                    rows.append(self._format_row(
                        i + offset, pos, -length, genotype, deleted))
                    # Insertion line at the same position with new bases.
                    length = randint(*self._LENGTH_RANGE)
                    rows.append(self._format_row(
                        i + offset + 1, pos, length, genotype,
                        self.__gensequence(length)))
        except Exception as e:
            # Best effort: log the failure and return the lines built so far.
            logging.exception(e)
        return ''.join(rows)

    def __gensequence(self, num):
        """Random generation of gene sequence.

        :param num: An integer indicating the length of gene sequence.
        :return: A string contains generated gene sequence.
        """
        bases = ['A', 'G', 'C', 'T']
        return ''.join(bases[randint(0, 3)] for _ in range(num))
class VCF(object):
    """Complex indel information for feature collection.

    Each stored record is a list ``[label, brkpntl, brkpntr]`` where label is
    0 for a homozygote, 1 for a heterozygote and -1 for randomly generated
    normal (non-variant) data.
    """

    def __init__(self, tablepath, isvariant, num):
        """Initialize an instance.

        :param tablepath: A string represents the path of table file.
        :param isvariant: A boolean suggests whether to generate simulation data.
        :param num: A integer count of simulation data we want to generate.
        """
        # Created per instance. A class-level list would be shared by every
        # VCF object, so records from one instance would leak into all others.
        self.__vcindel = []

        if isvariant:  # Load complex indels information from table file.
            with open(tablepath, 'r') as table:
                # Two consecutive lines describe one complex indel; only the
                # first line of each pair is read here.
                for line in table.readlines()[::2]:
                    self.__vcindel.append(self.__splitline(line.split('\t')))
        else:  # Randomly generate normal data, add them to complex indels.
            for _ in range(num):
                brkpntl = randint(10000, 1000000 - 10000)
                brkpntr = brkpntl + randint(500, 1500)
                self.__vcindel.append([-1, brkpntl, brkpntr])

    def getdata(self):
        """Return the list of all complex indels' feature.

        :return: A list contains the feature of all complex indels.
        """
        return self.__vcindel

    def __splitline(self, line):
        """Transform one split table line into the features of a complex indel.

        :param line: A list of the tab-separated fields of a table line;
            field 1 is the position, field 2 the signed length and field 3
            the genotype code.
        :return: A list ``[label, brkpntl, brkpntr]``.
        """
        brkpntl = int(line[1])
        # The length is negative on deletion lines; only its size matters.
        brkpntr = int(brkpntl + abs(int(line[2])))
        if int(line[3]) == 3:
            label = 0  # Homozygote.
        else:
            label = 1  # Heterozygote.
        return [label, brkpntl, brkpntr]
class CIndel(object):
    """A single complex indel record.

    Attributes:
        label: An integer holding the label of the complex indel.
        brkpntl: An integer holding the left margin of the complex indel.
        brkpntr: An integer holding the right margin of the complex indel.
    """

    # Class-level defaults, shadowed by the per-instance values set below.
    label = brkpntl = brkpntr = 0

    def __init__(self, label, brkpntl, brkpntr):
        """Store the label and both margins on the new instance.

        :param label: An integer holding the label of the complex indel.
        :param brkpntl: An integer holding the left margin of the complex indel.
        :param brkpntr: An integer holding the right margin of the complex indel.
        """
        self.label, self.brkpntl, self.brkpntr = label, brkpntl, brkpntr
class Read(object):
    """One read of a paired-end pair.

    Attributes:
        pos: An integer, the position of the first read of the pair.
        mapq: An integer, the mapping quality of the read against the
            reference sequence.
        pnext: An integer, the position of the second read of the pair.
        mate: A Read instance that is the mate of this read, or None until
            setmate() is called.
    """

    # Class-level defaults, shadowed once an instance assigns its own values.
    pos = mapq = pnext = 0
    mate = None

    def __init__(self, pos, mapq, pnext):
        """Store the alignment fields on the new instance.

        :param pos: An integer, the position of the first read of the pair.
        :param mapq: An integer, the mapping quality of the read against the
            reference sequence.
        :param pnext: An integer, the position of the second read of the pair.
        """
        self.pos, self.mapq, self.pnext = pos, mapq, pnext

    def setmate(self, mate):
        """Attach the mate of this paired-end read.

        :param mate: A Read instance that is the mate of this read.
        :return: None
        """
        self.mate = mate
        return None