-
Notifications
You must be signed in to change notification settings - Fork 9
/
paprica-mt_run.py
216 lines (160 loc) · 6.89 KB
/
paprica-mt_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
help_string = """
Created on Mon Feb 01 16:50:01 2016
@author: Jeff Bowman, bowmanjs@ldeo.columbia.edu
paprica is licensed under a Creative Commons Attribution-NonCommercial
4.0 International License. IF you use any portion of paprica in your
work please cite:
Bowman, Jeff S., and Hugh W. Ducklow. "Microbial Communities Can Be Described
by Metabolic Structure: A General Framework and Application to a Seasonally
Variable, Depth-Stratified Microbial Community from the Coastal West Antarctic
Peninsula." PloS one 10.8 (2015): e0135868.
If your analysis makes specific use of pplacer, Infernal, or pathway-tools
please make sure that you also cite the relevant publications.
CALL AS:
python paprica-mt_run.py [OPTIONS]
OPTIONS:
-i: Input fasta or fasta.gz. If you have PE reads you would enter two
files, like paprica-mt_run.py -i file1.fasta.gz file2.fasta.gz...
-o: Prefix for output files.
-ref_dir: Name of the directory containing the paprica database.
-pgdb_dir: The location where pathway-tools stores PGDBs. Only necessary
if pathways are predicted.
-pathways: T or F, whether pathways should be predicted. This can take a
long time.
-t: The number of threads for bwa to use.
"""
executable = '/bin/bash' # shell for executing commands
import sys
import subprocess
import os
import gzip
import re
import pandas as pd
## Define a stop function for diagnostic use only.
def stop_here():
    """Halt the script immediately (diagnostic use only).

    The original implementation indexed into an empty list to force an
    IndexError; raising SystemExit is the explicit, conventional way to
    abort a script and reads identically at the top level.
    """
    print('Manually stopped!')
    raise SystemExit('Manually stopped!')
## Identify directory locations.
cwd = os.getcwd() + '/'  # The current working directory, with trailing slash.

## With no command-line arguments assume a development/testing run and use
## the hard-coded location; otherwise derive the paprica install directory
## from the location of this script.
if len(sys.argv) > 1:
    paprica_path = os.path.dirname(os.path.realpath(__file__)) + '/'
else:
    paprica_path = '/volumes/hd2/jeff/paprica/'
## Read in command line arguments.  Every token that starts with '-' is a
## flag; its value is the following token, or '' when the flag is the last
## item on the command line.
command_args = {}

for position, token in enumerate(sys.argv):
    if not token.startswith('-'):
        continue
    key = token.strip('-')
    try:
        command_args[key] = sys.argv[position + 1]
    except IndexError:
        command_args[key] = ''

## Print the module help text and bail out if -h was given.
if 'h' in command_args:
    print(help_string)
    quit()
## Provide input switches for testing.  Each option falls back to a testing
## default when it was not supplied on the command line.

## Input reads: one file for SE, two (space-separated in a single quoted
## argument) for PE.
if 'i' in command_args:
    query = command_args['i'].split()
else:
    query = ['test_mt.fasta.gz']

name = command_args.get('o', 'test_mt')

## A user-supplied ref_dir names the database parent directory, so the
## paprica-mgt.database component is appended here; the testing default
## already includes it.
if 'ref_dir' in command_args:
    ref_dir = command_args['ref_dir']
    if ref_dir[-1] != '/':
        ref_dir = ref_dir + '/'
    ref_dir = ref_dir + 'paprica-mgt.database'
else:
    ref_dir = 'ref_genome_database/paprica-mgt.database'

pathways = command_args.get('pathways', 'F')
pgdb_dir = command_args.get('pgdb_dir', '/volumes/hd2/ptools-local/user/')
threads = command_args.get('t', 72)  # default is an int; CLI value stays a string

## Define path to reference directory.
ref_dir_path = paprica_path + ref_dir

if not ref_dir_path.endswith('/'):
    ref_dir_path = ref_dir_path + '/'
#%% Use BWA to search the query against the paprica-mt database

## BUG FIX: the original PE branch had a stray '+' at a line continuation
## ("query[0] + \" then "+ ' '"), which applies a unary plus to a string and
## raised TypeError for every paired-end run.  SE and PE differ only in the
## number of input files, so build that portion of the command once.
## NOTE(review): file names are interpolated into a shell string; names
## containing shell metacharacters would be mis-parsed.  Inputs are assumed
## to be plain file names.
query_files = ' '.join([cwd + q for q in query[:2]])

bwa_aln = subprocess.Popen('bwa mem ' \
+ '-t ' + str(threads) + ' ' \
+ paprica_path + ref_dir + '/paprica-mt.fasta.gz ' \
+ query_files + ' > ' \
+ cwd + name + '.sam', shell = True, executable = executable)

bwa_aln.communicate()

## gzip the sam file to save space.

gz = subprocess.Popen('gzip -f ' + cwd + name + '.sam', shell = True, executable = executable)
gz.communicate()
#%% Iterate across sam file, tallying the number of hits to each reference that appears in the results.

## This section should be parallelized. But not clear how to do parallel read
## from file. Best strategy for parallelization is probably coarse grained by
## running on multiple input files at once.

## Tally into a plain dict first: inserting element-wise into a pandas
## Series is O(len) per insert (quadratic overall), and pd.Series() with no
## data and no dtype is deprecated.  The dict is converted to a Series once,
## below.
counts = {}
i = 0  # total number of alignment records read
f = 0  # number of mapped records (bitwise flag != 4)

with gzip.open(cwd + name + '.sam.gz', 'rt') as sam:
    for line in sam:
        if line.startswith('@'):
            continue  # skip SAM header lines
        i = i + 1
        fields = line.split('\t')
        ## For mapped reads, add to the tally for the reference sequence.
        ## These are selected based on the bitwise flag in the SAM format;
        ## flag 4 marks an unmapped segment.
        if fields[1] != '4':
            f = f + 1
            rname = fields[2]
            counts[rname] = counts.get(rname, 0) + 1
        ## Periodic progress report instead of one print per mapped read.
        if i % 100000 == 0:
            print('tallying hits for', name + ':', 'hits =', f, 'out of', i)

print('tallying hits for', name + ':', 'hits =', f, 'out of', i)

prot_counts = pd.Series(counts, dtype = 'float64')
prot_counts.name = 'n_hits'
## Add information from paprica-mt.ec.csv: align the hit tally with the
## reference CDS table and keep only references that received hits.

prot_unique_cds_df = pd.read_csv(paprica_path + ref_dir + '/paprica-mt.ec.csv.gz', header = 0, index_col = 0)
prot_counts = prot_counts.reindex(prot_unique_cds_df.index)
prot_unique_cds_df = pd.concat([prot_unique_cds_df, prot_counts], axis = 1)
prot_unique_cds_df.dropna(subset = ['n_hits'], inplace = True)
prot_unique_cds_df['length_cds'] = prot_unique_cds_df.translation.str.len()  # CDS length, would be nice if this was precomputed

## Add column flagging whether CDS is unique or not. CDS that are
## not unique should not be used for taxonomic binning, or for
## genome-level gene expression analysis, as it is ambiguous as to
## which genome a mapped transcript belongs.

prot_unique_cds_df['unique'] = 'N'
prot_unique_cds_df.loc[prot_unique_cds_df.cds_n_occurrences == 1, 'unique'] = 'Y'

## Write out the final csv file.

print('writing output csv:', cwd + name + '.tally_ec.csv...')
columns_out = ['genome', 'domain', 'tax_name', 'EC_number', 'product', 'length_cds', 'n_hits', 'unique']
prot_unique_cds_df.to_csv(cwd + name + '.tally_ec.csv', columns = columns_out)

## Write out report file.

with open(cwd + name + '.paprica-mt_report.txt', 'w') as report:
    print('file', query, file=report)
    print('n_reads', i, file=report)
    print('n_hits', f, file=report)
    ## Guard against an empty input (i == 0) to avoid ZeroDivisionError.
    print('f_hits', float(f) / i if i > 0 else 0.0, file=report)
    print('n_genomes', len(prot_unique_cds_df.genome.value_counts()), file=report)

print('done')