Skip to content

Commit

Permalink
Merge pull request #22 from martinghunt/sequence_trim
Browse files Browse the repository at this point in the history
Sequence trim
  • Loading branch information
martinghunt committed Sep 8, 2014
2 parents 19f28dc + 651ecc4 commit 5f6a9d5
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 0 deletions.
33 changes: 33 additions & 0 deletions fastaq/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,39 @@ def search_for_seq(infile, outfile, search_string):
utils.close(fout)


def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50):
trim_seqs = {}
file_to_dict(to_trim_file, trim_seqs)
trim_seqs = [x.seq for x in trim_seqs.values()]

seq_reader_1 = sequences.file_reader(infile_1)
seq_reader_2 = sequences.file_reader(infile_2)
f_out_1 = utils.open_file_write(outfile_1)
f_out_2 = utils.open_file_write(outfile_2)

for seq_1 in seq_reader_1:
try:
seq_2 = next(seq_reader_2)
except:
utils.close(f_out)
raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

for seq in seq_1, seq_2:
for trim_seq in trim_seqs:
if seq.seq.startswith(trim_seq):
seq.seq = seq.seq[len(trim_seq):]
break

if len(seq_1) >= min_length and len(seq_2) >= min_length:
print(seq_1, file=f_out_1)
print(seq_2, file=f_out_2)


utils.close(f_out_1)
utils.close(f_out_2)



def translate(infile, outfile, frame=0):
seq_reader = sequences.file_reader(infile)
fout = utils.open_file_write(outfile)
Expand Down
12 changes: 12 additions & 0 deletions fastaq/tests/data/tasks_test_sequence_trim_1.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>1/1
TRIM1GCTCGAGCT
>2/1
TRIM1AGCTAGCTAG
>3/1
CGCTAGCTAG
>4/1
TRIM2AGCTAGCTAG
>5/1
AGCTAGCTAG
>6/1
TRIM4AGCTAGCTAG
8 changes: 8 additions & 0 deletions fastaq/tests/data/tasks_test_sequence_trim_1.trimmed.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>3/1
CGCTAGCTAG
>4/1
AGCTAGCTAG
>5/1
AGCTAGCTAG
>6/1
AGCTAGCTAG
12 changes: 12 additions & 0 deletions fastaq/tests/data/tasks_test_sequence_trim_2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>1/2
TRIM1ACGTACGTAC
>2/2
TRIM2ACGTAGTGA
>3/2
ACGCTGCAGTCAGTCAGTAT
>4/2
TRIM3CGATCGATCG
>5/2
TRIM3CGATCGATCG
>6/2
CGATCGATCG
8 changes: 8 additions & 0 deletions fastaq/tests/data/tasks_test_sequence_trim_2.trimmed.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>3/2
ACGCTGCAGTCAGTCAGTAT
>4/2
CGATCGATCG
>5/2
CGATCGATCG
>6/2
CGATCGATCG
8 changes: 8 additions & 0 deletions fastaq/tests/data/tasks_test_sequences_to_trim.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>1
TRIM1
>2
TRIM2
>3
TRIM3
>4
TRIM4
17 changes: 17 additions & 0 deletions fastaq/tests/tasks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,23 @@ def test_search_for_seq(self):
os.unlink(tmp)


class TestSequenceTrim(unittest.TestCase):
def test_sequence_trim(self):
'''Test sequence_trim'''
tmp1 = 'tmp.trimmed_1.fa'
tmp2 = 'tmp.trimmed_2.fa'
in1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.fa')
in2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.fa')
to_trim = os.path.join(data_dir, 'tasks_test_sequences_to_trim.fa')
expected1 = os.path.join(data_dir, 'tasks_test_sequence_trim_1.trimmed.fa')
expected2 = os.path.join(data_dir, 'tasks_test_sequence_trim_2.trimmed.fa')
tasks.sequence_trim(in1, in2, tmp1, tmp2, to_trim, min_length=10)
self.assertTrue(filecmp.cmp(expected1, tmp1))
self.assertTrue(filecmp.cmp(expected2, tmp2))
os.unlink(tmp1)
os.unlink(tmp2)


class TestTranslate(unittest.TestCase):
def test_translate(self):
'''Test translate works in each frame'''
Expand Down
23 changes: 23 additions & 0 deletions scripts/fastaq_sequence_trim
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3

import argparse
from fastaq import tasks

parser = argparse.ArgumentParser(
description = 'Trims sequences off the start of all sequences in a pair of fasta/q files, whenever there is a perfect match. Only keeps a read pair if both reads of the pair are at least a minimum length after any trimming',
usage = '%(prog)s [options] <fasta/q 1 in> <fastaq/2 in> <out 1> <out 2> <trim_seqs>')
parser.add_argument('--min_length', type=int, help='Minimum length of output sequences [%(default)s]', default=50, metavar='INT')
parser.add_argument('infile_1', help='Name of forward fasta/q file to be trimmed', metavar='fasta/q 1 in')
parser.add_argument('infile_2', help='Name of reverse fasta/q file to be trimmed', metavar='fasta/q 2 in')
parser.add_argument('outfile_1', help='Name of output forward fasta/q file', metavar='out_1')
parser.add_argument('outfile_2', help='Name of output reverse fasta/q file', metavar='out_2')
parser.add_argument('trim_seqs', help='Name of fasta/q file of sequences to search for at the start of each input sequence', metavar='trim_seqs')
options = parser.parse_args()
tasks.sequence_trim(
options.infile_1,
options.infile_2,
options.outfile_1,
options.outfile_2,
options.trim_seqs,
min_length=options.min_length
)

0 comments on commit 5f6a9d5

Please sign in to comment.