#!/usr/bin/env python
# PYTHON FUNCTIONS USED FOR FILE PARSING
# Make the imports
import os
import re
import sys
from subprocess import Popen


def parse_cntg(seq, number, map_file, folder, seq_index):
    """
    Define the parse contig function. This function will take a nucleotide sequence,
    the number of the sequence count, the map file, the current folder and the
    sequence index as input parameters. It will create a file for each contig,
    but if the 250,000 nt threshold is reached it will create a folder containing
    equally sized files. Additionally it has to return the updated index.
    """
    # Save the input arguments
    current_nucleotides = seq
    i = number
    map = map_file
    current_dir = folder
    index = seq_index
    # If the file is empty, this means the sequence started with Ns (this may occur during scaffold parsing).
    # In this case, do not write a contig file (it would be empty)
    if(len(current_nucleotides) == 0):
        pass
    # If the length of the contig is smaller than 250,000 nt, write a single file
    elif(len(current_nucleotides) < 250000):
        # Write the file
        output_file = open("{}/Sequence{}".format(current_dir, str(i)), "w")
        output_file.write(current_nucleotides)
        output_file.close()
        # Write in the map file
        map.write("{}/Sequence{}\t{} {}\n".format(current_dir, str(i), str(index),
                                                  str(index + len(current_nucleotides) - 1)))
        index = index + len(current_nucleotides)
    # If this contig is bigger than 250,000 nt, split it into files of 100,000 nt each
    else:
        # Divide the current contig into chunks of 100,000 nt
        chunks = [current_nucleotides[y:y + 100000]
                  for y in range(0, len(current_nucleotides), 100000)]
        # Create a new folder to store those chunks
        current_dir = "{}/Sequence{}".format(current_dir, str(i))
        os.mkdir(current_dir)
        # Create chunk_count
        chunk_count = 0
        # Write each chunk in a separate file
        for chunk in chunks:
            chunk_count = chunk_count + 1
            # Write the file
            output_file = open(
                "{}/Sequence{}".format(current_dir, str(chunk_count)), "w")
            output_file.write(chunk)
            output_file.close()
            # Write in the map file
            map.write("{}/Sequence{}\t{} {}\n".format(current_dir, str(chunk_count),
                                                      str(index), str(index + len(chunk) - 1)))
            index = index + len(chunk)
    # Return the updated index
    return index
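

# Illustrative usage sketch (not part of the original module; assumes a
# writable working directory and an open map file handle):
#
#   >>> mapfile = open("Map.txt", "w")
#   >>> parse_cntg(seq="A" * 300000, number=1, map_file=mapfile,
#   ...            folder=".", seq_index=1)
#   300001
#
# A 300,000 nt contig exceeds the 250,000 nt threshold, so "./Sequence1/" is
# created with three 100,000 nt chunk files, and the returned index is the
# old index plus the sequence length (1 + 300000).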


def parse_scf(seq, folder, map_file, seq_index):
    """
    Define the parse scaffold function. This function will take a nucleotide sequence, the map file,
    the current folder and the sequence index as input parameters.
    It will create a folder for each scaffold, and pass each contig to the parse_cntg function.
    It has to return an updated index.
    """
    # Save the arguments
    current_nucleotides = seq
    current_dir = folder
    map = map_file
    # Store all the contigs in one list
    contig_list = re.split("N+", current_nucleotides)
    # Store all the gaps in one list and remove any empty element
    gap_list = re.split("[^N]+", current_nucleotides)
    gap_list = list(filter(None, gap_list))
    # Initiate the index, the count of subsequences and the current subsequence
    subseq_count = 0
    index = seq_index
    current_subsequence = ""
    # Loop through all the contigs in the list
    for x in range(0, len(contig_list)):
        # Add the current contig into the current subsequence
        current_subsequence = current_subsequence + contig_list[x]
        # If this is the last contig, write the current subsequence into a file
        if(x == len(contig_list) - 1):
            subseq_count = subseq_count + 1
            index = parse_cntg(seq=current_subsequence, number=subseq_count,
                               map_file=map, folder=current_dir, seq_index=index)
        # If the next gap is smaller than 25, iterate to the next contig.
        # Do not forget to add those Ns to the subsequence!
        elif(len(gap_list[x]) < 25):
            current_subsequence = current_subsequence + gap_list[x]
            continue
        # Otherwise store the current subsequence in a new file
        else:
            subseq_count = subseq_count + 1
            index = parse_cntg(seq=current_subsequence, number=subseq_count,
                               map_file=map, folder=current_dir, seq_index=index)
            map.write("N\t" + str(index) + " "
                      + str(index + len(gap_list[x]) - 1) + "\n")
            index = index + len(gap_list[x])
            current_subsequence = ""
    # Return the updated index
    return index
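

# Illustrative example of the contig/gap split used above (gaps are runs of
# the character "N"):
#
#   >>> re.split("N+", "ACGT" + "N" * 30 + "TTTT")
#   ['ACGT', 'TTTT']
#   >>> gap_list = list(filter(None, re.split("[^N]+", "ACGT" + "N" * 30 + "TTTT")))
#   >>> [len(g) for g in gap_list]
#   [30]
#
# For a sequence without leading or trailing Ns, contig x is always followed
# by gap x, so contig_list and gap_list stay aligned in the loop above.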


def parse_super(seq_list, gap_list, folder, map_file):
    """
    Create a method to parse the super subsequences. This function will take the list of
    super subsequences, the list of super gaps, the current folder and the map file as
    input parameters. It will create a folder for each super subsequence, and pass each
    scaffold or contig to its respective parsing function. It has to return an updated index.
    """
    # Save the arguments
    super_subseq_list = seq_list
    super_gaps_list = gap_list
    current_dir = folder
    map = map_file
    # Subsequence count is 0 and index is 1
    subseq_count = 0
    index = 1
    # For each of the super subsequences, determine their class and act accordingly
    for x in range(0, len(super_subseq_list)):
        subseq_count = subseq_count + 1
        # If the super subsequence is empty (this will occur when the sequence starts or ends with Ns),
        # don't do anything, only update the map if necessary
        if(len(super_subseq_list[x]) == 0):
            # Unless this empty super subsequence is at the end (this would mean the
            # sequence ends with Ns), update the map file
            if(x != len(super_gaps_list)):
                map.write("N\t{} {}\n".format(str(index), str(
                    index + len(super_gaps_list[x]) - 1)))
                index = index + len(super_gaps_list[x])
        # If this sequence is a contig, store its content in a new file and write the roadmap
        elif(IsContig(super_subseq_list[x])):
            # Parse the contig
            index = parse_cntg(
                seq=super_subseq_list[x], number=subseq_count, map_file=map, folder=current_dir, seq_index=index)
            # Only if this is not the last super subsequence, write in the map file the Ns
            if(x != len(super_gaps_list)):
                # Write in the map file
                map.write("N\t{} {}\n".format(str(index), str(
                    index + len(super_gaps_list[x]) - 1)))
                index = index + len(super_gaps_list[x])
        # If this sequence is a scaffold, store its contigs in a new folder
        else:
            # Create a new folder to allocate the scaffold
            current_subfolder = "{}/Sequence{}".format(
                current_dir, str(subseq_count))
            os.mkdir(current_subfolder)
            # Parse the scaffold
            index = parse_scf(
                seq=super_subseq_list[x], folder=current_subfolder, map_file=map, seq_index=index)
            # Only if this is not the last super subsequence, write in the map file the N gap
            if(x != len(super_gaps_list)):
                # Write the map file
                map.write("N\t{} {}\n".format(str(index), str(
                    index + len(super_gaps_list[x]) - 1)))
                index = index + len(super_gaps_list[x])
    # Return the updated index (the docstring promises it; the original function
    # was missing this return statement)
    return index
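

# Alignment invariant worth noting (illustration, not original code): splitting
# a sequence on super gaps yields one more subsequence than gaps, so gap x
# always sits between subsequence x and subsequence x + 1:
#
#   >>> seq = "ACGT" + "N" * 10000 + "TTTT"
#   >>> len(re.split("N{10000,}", seq))
#   2
#   >>> len([g for g in re.split("[^N]+", seq) if len(g) >= 10000])
#   1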


def parse_nucleotides(seq, number, map_file):
    """
    Create a parse nucleotides function. This function parses the nucleotides of a sequence using parse_cntg,
    parse_scf and parse_super.
    """
    # Store the arguments
    current_nucleotides = seq
    i = number
    map = map_file
    # If this sequence is a contig, store its content in a new file and write the roadmap
    if(IsContig(current_nucleotides)):
        # Write the info about the current sequence in the map file
        map.write(">Sequence" + str(i) + "\t"
                  + str(len(current_nucleotides)) + "\n")
        # Parse the contig
        parse_cntg(seq=current_nucleotides, number=i,
                   map_file=map, folder=".", seq_index=1)
    # If this sequence is a scaffold, store its contigs in a new folder
    elif(IsScaffold(current_nucleotides)):
        # Create a directory and write the map
        current_dir = "./Sequence" + str(i)
        os.mkdir(current_dir)
        map.write(">Sequence" + str(i) + "\t"
                  + str(len(current_nucleotides)) + "\n")
        # Parse the scaffold
        parse_scf(seq=current_nucleotides, folder=current_dir,
                  map_file=map, seq_index=1)
    # If it is not a contig or a scaffold, it is a chromosome
    else:
        # Create a directory and write the map
        current_dir = "./Sequence" + str(i)
        os.mkdir(current_dir)
        map.write(">Sequence" + str(i) + "\t"
                  + str(len(current_nucleotides)) + "\n")
        # Store all the subsequences separated by gaps of 10,000+ Ns in a list
        # (the regex matches runs of at least 10,000 Ns)
        super_subseq_list = re.split("N{10000,}", current_nucleotides)
        # Store all the N gaps in one list, keeping only those of 10,000 Ns or more
        super_gaps_list = re.split("[^N]+", current_nucleotides)
        super_gaps_list = [g for g in super_gaps_list if len(g) >= 10000]
        # If there are no super gaps, the whole sequence is like a large scaffold
        if(len(super_gaps_list) == 0):
            # All the sequence is a large scaffold
            parse_scf(seq=current_nucleotides, folder=current_dir,
                      map_file=map, seq_index=1)
        # Otherwise parse each super subsequence
        else:
            # Parse each of the super subsequences
            parse_super(seq_list=super_subseq_list,
                        gap_list=super_gaps_list, folder=current_dir, map_file=map)
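

# The chromosome branch above relies on "super gaps" of at least 10,000 Ns.
# A small illustration (not part of the original module):
#
#   >>> chrom = "ACGT" + "N" * 12000 + "GGCC"
#   >>> re.split("N{10000,}", chrom)
#   ['ACGT', 'GGCC']
#
# Runs shorter than 10,000 Ns are left inside the subsequences and handled
# later by parse_scf as ordinary scaffold gaps.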


def IsContig(seq):
    """
    Define the contig and the scaffold classification functions.
    They will return True when one of these sequences is detected.
    """
    # Check if it has gaps of 25 Ns or more, in which case this is not a contig
    if(seq.find("N" * 25) == -1):
        return True
    else:
        return False


def IsScaffold(seq):
    # If it has N-25 gaps (IsContig already returned False) and its length is
    # lower than 1,000,000, this is a scaffold
    if(len(seq) < 1000000):
        return True
    else:
        return False
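

# Classification examples (illustrative only):
#
#   >>> IsContig("ACGT" * 100)                  # no long N gaps
#   True
#   >>> IsContig("ACGT" + "N" * 25 + "ACGT")    # contains a 25-N gap
#   False
#   >>> IsScaffold("ACGT" + "N" * 25 + "ACGT")  # short, gapped sequence
#   True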


def parse_dependent_dict(DependentDict):
    """
    Create a parse dependent dict function to parse the information of a dependent file,
    contained in a dependent dictionary, into the repository.
    """
    # Initiate required variables
    seq_count = 0
    # Create a SeqID_Map file. This map file stores information about the seqID, the location of files and the count of sequences
    seqID_map = open("./SeqIDs_Map.txt", "w")
    # Loop through the sequences in the dependent file
    for key in DependentDict.keys():
        seq_count += 1
        # Create subfiles for 1 Mb regions
        # Write in the seqID map
        seqID_map.write(">" + key + "\t/Sequence" + str(seq_count) + "\n")
        # Create a folder for the sequence and open the first subfile
        os.mkdir("./Sequence" + str(seq_count))
        # Loop through the regions stored in the dictionary {seqID:{1M:[line[1:],],2M:[],}}
        for region in DependentDict[key].keys():
            # Write in the map
            seqID_map.write("/Sequence" + str(seq_count)
                            + "/Region" + str(region) + "\n")
            # Open a new file for the region
            output_file = open("./Sequence" + str(seq_count)
                               + "/Region" + str(region), "w")
            # Write the entries of the region into the file
            for line in DependentDict[key][region]:
                output_file.write(line)
            # Close the file
            output_file.close()
    # Close seqID map
    seqID_map.close()
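

# Shape of the expected input (a sketch; region numbers come from integer-
# dividing the start coordinate by 1,000,000 elsewhere in this module):
#
#   >>> dependent = {"chr1": {0: ["chr1\t...\t500\t...\n"],
#   ...                       2: ["chr1\t...\t2500123\t...\n"]}}
#   >>> parse_dependent_dict(DependentDict=dependent)
#
# This would create ./Sequence1/Region0 and ./Sequence1/Region2, plus a
# SeqIDs_Map.txt listing them under ">chr1".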


def format_bytes(size):
    """
    Turn a size in bytes into a (value, unit) tuple using binary prefixes.
    """
    # 2**10 = 1024
    power = 2**10
    n = 0
    power_labels = {0: '', 1: 'K', 2: 'M', 3: 'G', 4: 'T'}
    while(size > power):
        size /= power
        n += 1
    return str(size), power_labels[n] + 'B'
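

# Worked example (note: size /= power is true division under Python 3, so the
# value keeps its decimals):
#
#   >>> format_bytes(2048)
#   ('2.0', 'KB')
#   >>> format_bytes(500)
#   ('500', 'B')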


def parse_dataset(dataset, input_path, size, update):
    """
    Create a parse dataset function to parse a given dataset in a file into the repository structure.
    """
    # Determine the size of the input file (the size argument is recomputed here)
    file_size = os.path.getsize(input_path)
    size, label = format_bytes(file_size)
    size = str(size)
    # PARSE A GENOME DATASET
    # If the file to be parsed is a genome file.
    if(dataset == "Genome"):
        # Load the input file
        input_file = open(input_path, "r")
        # Initiate necessary variables
        i = 0
        current_nucleotides = ""
        mapfile = open("Map.txt", "w")
        invalid_input = True
        # Loop through the lines of the file
        for line in input_file:
            # Only read the line if it does not start with a # (description lines can start with this)
            if(line[0] != "#"):
                # Remove all new lines (sequence stored as a big line)
                line = line.rstrip()
                # Skip lines that are empty after stripping (a bare newline would
                # otherwise raise an IndexError below)
                if(line == ""):
                    continue
                # If the line starts with that symbol, this is a new sequence
                if(line[0] == ">"):
                    # The input file is not invalid anymore
                    invalid_input = False
                    # Unless this is the first sequence, write a file with its nucleotides
                    if(current_nucleotides != ""):
                        parse_nucleotides(
                            seq=current_nucleotides, number=i, map_file=mapfile)
                    # Add one to the sequence count
                    i = i + 1
                    # Empty the current nucleotides
                    current_nucleotides = ""
                # Otherwise store the current sequence in the current_nucleotides variable
                else:
                    # Record the line width of the first sequence line (used in the RepoMap)
                    if(current_nucleotides == ""):
                        line_size = str(len(line))
                    current_nucleotides = current_nucleotides + line
        # If, when the parsing is finished, the input file has not been validated, raise an exception
        if(invalid_input):
            # Inform the user
            print(
                "\n\n\t***INPUT ERROR: Please make sure to input a valid FASTA format file***\n")
            # Delete the invalid Genome folder
            os.system("rm -r ../Genome")
            # Exit the system
            sys.exit()
        # Parse the last sequence left out of the loop
        parse_nucleotides(seq=current_nucleotides, number=i, map_file=mapfile)
        # Close the map file and the input file
        input_file.close()
        mapfile.close()
        # If it is an update it is required to update the repomap
        if(update):
            # Open the repository map file in read mode and append all the lines to a list
            repo_list = []
            with open("../RepoMap.txt", "r") as repomap:
                # Loop through the lines
                for line in repomap:
                    # Split the line
                    line = line.split("\t")
                    # If the dataset of the current file is genome, substitute the fields of the name, line size and size of the file
                    if("Genome" == line[1]):
                        line[0] = os.path.basename(input_path)
                        line[3] = line_size
                        line[4] = "{} {}\n".format(size, label)
                    # Store the line with or without modification
                    repo_list.append("\t".join(line))
            # Open the file with write permit and write all the elements of the repo list
            # (the with statements close the file handles automatically)
            with open("../RepoMap.txt", "w") as repomap:
                for line in repo_list:
                    repomap.write(line)
        # Otherwise simply append a new line to the repomap
        else:
            # Open the repository map file in append mode
            with open("../RepoMap.txt", "a") as repomap:
                # Write the information corresponding with the added file
                repomap.write(os.path.basename(input_path) + "\t" + dataset
                              + "\t" + "./" + dataset + "\t" + line_size + "\t" + "{} {}\n".format(size, label))
    # PARSE AN ANNOTATION DATASET
    # If it is an annotation dataset
    elif(dataset == "Annotation"):
        # Open the dependent file and read the first line
        annotation_file = open(input_path, "r")
        # Create a comment file to store the comments at the beginning of the file
        comment_file = open("./Comments.txt", "w")
        # Initiate required variables
        annotation_dict = {}  # {seqID:{1M:[line[1:],],2M:[],}}
        line_number = 0
        # Loop through the annotation file lines
        for line in annotation_file:
            line_number += 1
            # If the line starts with #, it corresponds with a comment, store it in the comment file
            if(line[0] == "#"):
                comment_file.write(line)
            # Otherwise the line corresponds with an entry
            else:
                # Split the line using \t
                line = line.split("\t")
                # Annotation should have at least 8 fields and fields 3 and 4 should be numbers
                if(len(line) < 8 or not line[3].isdigit() or not line[4].isdigit()):
                    # Inform the user
                    print(
                        "\n\n\t***INPUT ERROR: Invalid input file. Please make sure to provide a valid GFF file for your Annotation dataset.***")
                    print("\n\t The problem was raised in line number "
                          + str(line_number) + " :\n\n\t" + "\t".join(line))
                    os.system("rm -r ../$(basename " + input_path + ")")
                    # Close the files and exit
                    comment_file.close()
                    annotation_file.close()
                    sys.exit()
                # If the seqID is already a key in the dictionary, append the new line to its region list
                elif(line[0] in annotation_dict.keys()):
                    # Determine the region of the current line using the start index
                    # (integer division, so e.g. start 2,500,123 falls in region 2)
                    region = (int(line[3]) // 1000000)
                    # Check if the region already has a list, if it does, append the new line to the list
                    if(region in annotation_dict[line[0]].keys()):
                        annotation_dict[line[0]][region].append(
                            "\t".join(line))
                    # Otherwise create a new list with the line
                    else:
                        annotation_dict[line[0]][region] = ["\t".join(line)]
                # If this seqID is not in the dictionary, create a new entry with the current line
                else:
                    annotation_dict[line[0]] = {}
                    # Determine the region of the current line using the start index
                    region = (int(line[3]) // 1000000)
                    annotation_dict[line[0]][region] = ["\t".join(line)]
        # Close the files
        comment_file.close()
        annotation_file.close()
        # Use the parse dependent dict function to parse the information contained in the resulting dictionary into the repo
        parse_dependent_dict(DependentDict=annotation_dict)
        # If it is an update it is required to update the repomap
        if(update):
            # Open the repository map file in read mode and append all the lines to a list
            repo_list = []
            with open("../../RepoMap.txt", "r") as repomap:
                # Loop through the lines
                for line in repomap:
                    # Split the line
                    line = line.split("\t")
                    # If the current file has the same name, substitute the size field with the new one
                    if(os.path.basename(input_path) == line[0]):
                        line[4] = "{} {}\n".format(size, label)
                    # Store the line with or without modification
                    repo_list.append("\t".join(line))
            # Open the file with write permit and write all the elements of the repo list
            # (the with statements close the file handles automatically)
            with open("../../RepoMap.txt", "w") as repomap:
                for line in repo_list:
                    repomap.write(line)
        # Otherwise simply append a new line to the repomap
        else:
            # Open the repository map file in append mode
            with open("../../RepoMap.txt", "a") as repomap:
                # Write the information corresponding with the added file
                repomap.write(os.path.basename(input_path) + "\t" + dataset + "\t" + "./"
                              + dataset + "/" + os.path.basename(input_path) + "\t1\t" + "{} {}\n".format(size, label))
    # PARSE A VARIANTS DATASET
    elif(dataset == "Variants"):
        # Open the dependent file and read the first line
        variants_file = open(input_path, "r")
        # Create a comment file to store the comments at the beginning of the file
        comment_file = open("./Comments.txt", "w")
        # Initiate required variables
        variants_dict = {}  # {seqID:{1M:[line[1:],],2M:[],}}
        line_number = 0
        # Loop through the file lines
        for line in variants_file:
            line_number += 1
            # If the line starts with #, it corresponds with a comment, store it in the comment file
            if(line[0] == "#"):
                comment_file.write(line)
            # Otherwise the line corresponds with an entry
            else:
                # Split the line using \t
                line = line.split("\t")
                # Variants should have at least 8 fields and field 1 should be a number
                if(len(line) < 8 or not line[1].isdigit()):
                    # Inform the user
                    print(
                        "\n\n\t***INPUT ERROR: Invalid input file. Please make sure to provide a valid VCF file for your Variants dataset.***")
                    print("\n\t The problem was raised in line number "
                          + str(line_number) + " :\n\n\t" + "\t".join(line))
                    os.system("rm -r ../$(basename " + input_path + ")")
                    # Close the files and exit
                    variants_file.close()
                    comment_file.close()
                    sys.exit()
                # If the seqID is already a key in the dictionary, append the new line to its region list
                elif(line[0] in variants_dict.keys()):
                    # Determine the region of the current line using the start index
                    # (integer division into 1 Mb buckets)
                    region = (int(line[1]) // 1000000)
                    # Check if the region already has a list, if it does, append the new line to the list
                    if(region in variants_dict[line[0]].keys()):
                        variants_dict[line[0]][region].append("\t".join(line))
                    # Otherwise create a new list with the line
                    else:
                        variants_dict[line[0]][region] = ["\t".join(line)]
                # If this seqID is not in the dictionary, create a new entry with the current line
                else:
                    variants_dict[line[0]] = {}
                    # Determine the region of the current line using the start index
                    region = (int(line[1]) // 1000000)
                    # Add the line to its corresponding region
                    variants_dict[line[0]][region] = ["\t".join(line)]
        # Close the files
        comment_file.close()
        variants_file.close()
        # Use the parse dependent dict function to parse the information
        # contained in the resulting dictionary into the repo
        parse_dependent_dict(DependentDict=variants_dict)
        # If this is an update it is required to update the repomap
        if(update):
            # Open the repository map file in read mode and append all the lines to a list
            repo_list = []
            with open("../../RepoMap.txt", "r") as repomap:
                # Loop through the lines
                for line in repomap:
                    # Split the line
                    line = line.split("\t")
                    # If the current file has the same name,
                    # substitute the field of the size of the file with the new one
                    if(os.path.basename(input_path) == line[0]):
                        line[4] = "{} {}\n".format(size, label)
                    # Store the line with or without modification
                    repo_list.append("\t".join(line))
            # Open the file with write permit and write all the elements of the repo list
            # (the with statements close the file handles automatically)
            with open("../../RepoMap.txt", "w") as repomap:
                for line in repo_list:
                    repomap.write(line)
        # Otherwise simply append a new line to the repomap
        else:
            # Open the repository map file in append mode
            with open("../../RepoMap.txt", "a") as repomap:
                # Write the information corresponding with the added file
                repomap.write(os.path.basename(input_path) + "\t" + dataset + "\t" + "./"
                              + dataset + "/" + os.path.basename(input_path) + "\t1\t" + "{} {}\n".format(size, label))
    # PARSE AN ALIGNMENT DATASET
    elif(dataset == "Alignment"):
        # Convert BAM to SAM if the input file is BAM
        converted = False
        if(input_path.endswith(".bam")):
            samOutput_path = './' + os.path.basename(input_path)[:-3] + 'sam'
            print("Converting BAM to SAM.")
            Popen("samtools view -h -o {} {}".format(samOutput_path,
                                                     input_path), shell=True).wait()
            print("Conversion done.")
            # New path for this file is .sam instead of .bam
            input_path = samOutput_path
            converted = True
            # Change the size of the file to the SAM's size (recompute the label
            # as well, so it matches the new value)
            size, label = format_bytes(os.path.getsize(input_path))
        # If this is an update it is required to update the repomap
        if(update):
            # Open the repository map file in read mode and append all the lines to a list
            repo_list = []
            repomap = open("../../RepoMap.txt", "r")
            # Loop through the lines
            for line in repomap:
                # Split the line
                line = line.split("\t")
                # If the current file has the same name,
                # substitute the field of the size of the file with the new one
                if(os.path.basename(input_path) == line[0]):
                    line[4] = "{} {}\n".format(size, label)
                # Store the line with or without modification
                repo_list.append("\t".join(line))
            # Close the file
            repomap.close()
            # Open the file with write permit and write all the elements of the repo list
            repomap = open("../../RepoMap.txt", "w")
            for line in repo_list:
                repomap.write(line)
        # Otherwise simply append a new line to the repomap
        else:
            # Open the repository map file in append mode
            repomap = open("../../RepoMap.txt", "a")
            # Write the information corresponding with the added file
            repomap.write(os.path.basename(input_path) + "\t" + dataset + "\t" + "./"
                          + dataset + "/" + os.path.basename(input_path) + "\t1\t" + "{} {}\n".format(size, label))
        # Close the repository map
        repomap.close()
        # Create a comment file to store the comments at the beginning of the file
        comment_file = open("./Comments.txt", "w")
        # Open the SAM file
        sam_file = open(input_path, "r")
        # Initiate variables
        seqID_dict = {}  # {seqID:directory}
        seq_count = 0
        region_file = None
        # Loop through the lines of the file and store the comments
        for line in sam_file:
            # If the line is a comment, write in the comment file
            if(line[0] == "@"):
                comment_file.write(line)
            # Otherwise it is a read, store it
            else:
                # Split the line
                line = line.split("\t")
                # If the seqID is new (there is no directory already in place for the sequence)
                if(line[2] not in seqID_dict.keys()):
                    # Add one to the seq_count
                    seq_count += 1
                    # Add it to the dictionary, with its count
                    seqID_dict[line[2]] = "Sequence" + str(seq_count)
                    # Create a new directory for the sequence
                    os.mkdir("./Sequence" + str(seq_count))
                # Determine the region of the line (integer division into 1 Mb buckets)
                region = (int(line[3]) // 1000000)
                region_path = seqID_dict[line[2]] + "/Region" + str(region)
                # If the region and seqID have not changed since the last line, keep
                # writing into the open file; otherwise open the new region file
                if(region_file is None or region_file.name != region_path):
                    if(region_file is not None):
                        region_file.close()
                    region_file = open(region_path, "a")
                # Append the line to the file of the corresponding region
                region_file.write("\t".join(line))
        # Close the files
        if(region_file is not None):
            region_file.close()
        sam_file.close()
        comment_file.close()
        # Create a map file to inform the structure of the dataset
        mapfile = open("./SeqIDs_Map.txt", "w")
        # Include all the sequences with their subfiles
        for seqID in seqID_dict.keys():
            # Write the seqID and its directory
            mapfile.write(">" + seqID + "\t/" + seqID_dict[seqID] + "\n")
            # Create a list of subfiles and loop through them. Write them in the map file.
            # Sort the files according to their region number (the names are "Region<N>",
            # so a plain string sort would put Region10 before Region2).
            region_list = sorted(os.listdir(
                "./" + seqID_dict[seqID]), key=lambda x: int(x[6:]))
            for subfile in region_list:
                mapfile.write("/" + seqID_dict[seqID] + "/" + subfile + "\n")
        # Close the map file
        mapfile.close()
        # Remove the SAM file that is the result of the BAM conversion
        if(converted):
            os.remove(input_path)
    # RAISE AN ERROR
    # Otherwise there was a problem
    else:
        print("\n***INTERNAL ERROR: DATASET NOT RECOGNISED: " + dataset + "\n")
    # Exit the function
    return
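

# Illustrative call (a sketch; the relative paths above suggest this function
# is invoked from inside the dataset directory of the repository):
#
#   >>> parse_dataset(dataset="Genome", input_path="/data/assembly.fa",
#   ...               size=None, update=False)
#
# The size argument is recomputed from the file itself, so the value passed
# in is only a placeholder here.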


def parse_delta_file(file_path):
    """
    Create a parse delta file function to store the information contained in the MUMmer delta file
    into a list with an entry for every alignment [oldID, newID, oldLength, newLength].
    """
    delta_list = []
    # Loop through the lines of the delta file
    with open(file_path, "r") as delta:
        for line in delta:
            if(line[0] == ">"):
                line = line[1:].split()
                delta_list.append(line[:4])
    # Return the list
    return delta_list
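

# Header lines in a nucmer delta file look like ">refID qryID refLen qryLen",
# so each entry of the returned list holds those four fields as strings.
# Illustration (hypothetical file content):
#
#   >chr1_old chr1_new 248956422 248955430
#
# would yield the entry ['chr1_old', 'chr1_new', '248956422', '248955430'].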


def parse_snp_file(file_path):
    """
    Create a parse_snp_file function to parse a snp file into a dictionary containing
    the number of insertions, deletions and substitutions for each alignment.
    """
    # Initiate an empty dictionary {(oldID, newID):[insertions,deletions,substitutions]...}
    snp_dict = {}
    # Open the snp file and loop through its lines
    with open(file_path, "r") as snp:
        for line in snp:
            # Split the line into its fields
            line = line.rstrip().split("\t")
            # The key is the (oldID, newID) pair of the alignment
            key = (line[10], line[11])
            # If the key is already in the dictionary, add the current snp into the count
            if(key in snp_dict.keys()):
                # If it is an insertion, add one to the count
                if(line[1] == "."):
                    snp_dict[key][0] += 1
                # If it is a deletion, add one to the count
                elif(line[2] == "."):
                    snp_dict[key][1] += 1
                # Otherwise it is a substitution, add one to the count
                else:
                    snp_dict[key][2] += 1
            # If the current key is not in the dictionary, create a new entry
            else:
                # If it is an insertion, add one to the count
                if(line[1] == "."):
                    snp_dict[key] = [1, 0, 0]
                # If it is a deletion, add one to the count
                elif(line[2] == "."):
                    snp_dict[key] = [0, 1, 0]
                # Otherwise it is a substitution, add one to the count
                else:
                    snp_dict[key] = [0, 0, 1]
    # Return the dictionary
    return snp_dict
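

# Classification logic in brief (fields as assumed by this parser: column 1
# holds the old/reference base, column 2 the new/query base, and columns 10
# and 11 the sequence IDs of tab-separated show-snps output):
#
#   old base == "."  ->  insertion in the new sequence
#   new base == "."  ->  deletion from the old sequence
#   anything else    ->  substitution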


def parse_coords_file(file_path):
    """
    Create a parse_coords_file function to return a dictionary
    with the information contained in the summary coords file.
    """
    # Create the dictionary {(oldID, newID):[lines]}
    coords_dict = {}
    # Open the file and loop through the lines
    with open(file_path, "r") as coords:
        for line in coords:
            line = line.rstrip().split("\t")
            key = (line[9], line[10])
            # If the key is already in the dictionary, add the line to the current list
            if(key in coords_dict.keys()):
                coords_dict[key].append(line[:9])
            # Otherwise create a new entry in the dictionary
            else:
                coords_dict[key] = [line[:9]]
    # Return the dictionary
    return coords_dict
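

# Shape of the returned dictionary (illustrative; fields 9 and 10 are taken
# to be the old and new sequence IDs of each alignment):
#
#   {("chr1_old", "chr1_new"): [[<first 9 coord fields>], ...],
#    ("chr2_old", "chr2_new"): [...]}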


# # Create lowest_highest function that will be used for parsing annotation and variants datasets.
# # It returns the highest and the lowest indexes in the output file.
# def lowest_highest(output_file, mode):
#     # When the file is completely written, it is necessary to write the information in the seqID map file
#     # Initiate lowest and highest values (indicate the lowest and the highest index inside the file)
#     lowest = 99999999999999999999999999999999
#     highest = 0
#     # If it is a variants file, determine if its index is higher than highest or lower than lowest, and substitute it if so.
#     if(mode == "Variants"):
#         # Loop through the lines of the file
#         for line in output_file:
#             if(int(line.split("\t")[0]) > highest):
#                 highest = int(line.split("\t")[0])
#             if(int(line.split("\t")[0]) < lowest):
#                 lowest = int(line.split("\t")[0])
#     # If it is an alignment file, the index is in the field 3
#     elif(mode == "Alignment"):
#         # Loop through the lines of the file
#         for line in output_file:
#             if(int(line.split("\t")[3]) > highest):
#                 highest = int(line.split("\t")[3])
#             if(int(line.split("\t")[3]) < lowest):
#                 lowest = int(line.split("\t")[3])
#     # Same for annotation
#     else:
#         # Loop through the lines of the file
#         for line in output_file:
#             if(int(line.split("\t")[3]) > highest):
#                 highest = int(line.split("\t")[3])
#             if(int(line.split("\t")[2]) < lowest):
#                 lowest = int(line.split("\t")[2])
#     # Return results
#     return [str(lowest), str(highest)]