Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add STRait Razor output as input #17

Merged
merged 6 commits into from
Apr 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion lusSTR/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ def format_subparser(subparsers):
help='file to which output will be written; default is terminal (stdout)'
)
cli.add_argument(
'input', help='UAS Sample Details Report (in .xlsx format).'
'input',
help='Input is either a single file (UAS Sample Details Report, in .xlsx format) or a '
'directory of STRait Razor output files. If input is the UAS Sample Details Report '
'(in .xlsx format), use of the --uas flag is required. If STRait Razor output is '
'used, the name of the provided directory will be used as the Analysis ID in the '
'final annotation table. Output files within the directory should be named as such: '
'SampleID_STRaitRazor.txt (e.g. A001_STRaitRazor.txt).'
)
cli.add_argument(
'--uas', action='store_true',
help='Use if sequences have been previously run through the ForenSeq UAS.'
)


Expand Down
67 changes: 55 additions & 12 deletions lusSTR/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,69 @@

import lusSTR
import argparse
import os
import pandas as pd
import re
import sys


def strait_razor_concat(input_dir):
    '''
    Prepare STRait Razor output for use in the 'annotate' lusSTR command.

    STRait Razor outputs individual files per sample. Every entry in `input_dir` is read
    as a tab-separated, header-less table with five columns (locus:allele, length,
    sequence, forward reads, reverse reads). Each table is restricted to the loci listed
    below, forward and reverse reads are summed, and the per-sample tables are
    concatenated (in sorted file-name order) into one table.

    Parameters
    ----------
    input_dir : str
        Path to the directory of STRait Razor output files, with or without a trailing
        path separator.

    Returns
    -------
    pandas.DataFrame
        Table with the columns 'Locus', 'Total_Reads', 'Sequence' and 'SampleID'. The
        SampleID is the file name with a trailing '_STRaitRazor.txt' removed. An empty
        table with these columns is returned when the directory holds no files.
    '''
    loci_list = [
        'CSF1PO', 'D10S1248', 'D12S391', 'D13S317', 'D16S539', 'D17S1301', 'D18S51', 'D19S433',
        'D1S1656', 'D20S482', 'D21S11', 'D22S1045', 'D2S1338', 'D2S441', 'D3S1358', 'D4S2408',
        'D5S818', 'D6S1043', 'D7S820', 'D8S1179', 'D9S1122', 'FGA', 'PentaD', 'PentaE', 'TH01',
        'TPOX', 'vWA'
    ]
    output_columns = ['Locus', 'Total_Reads', 'Sequence', 'SampleID']
    sample_tables = []
    for filename in sorted(os.listdir(input_dir)):
        # Escape the dot and anchor at the end so only the real suffix is stripped.
        name = re.sub(r"_STRaitRazor\.txt$", "", filename)
        # os.path.join keeps the path valid whether or not input_dir ends with a separator.
        table = pd.read_table(os.path.join(input_dir, filename), sep="\t", header=None)
        table.columns = ['Locus_allele', 'Length', 'Sequence', 'Forward_Reads', 'Reverse_Reads']
        # Only the locus part (before the first ':') is needed downstream.
        table['Locus'] = table['Locus_allele'].str.split(":").str[0]
        # Work on an explicit copy so the column assignments below never target a view.
        filtered = table[table['Locus'].isin(loci_list)].copy()
        filtered['Total_Reads'] = filtered['Forward_Reads'] + filtered['Reverse_Reads']
        filtered['SampleID'] = name
        sample_tables.append(filtered.loc[:, output_columns])
    if not sample_tables:
        # pd.concat raises on an empty list; return a correctly shaped empty table instead.
        return pd.DataFrame(columns=output_columns)
    # A single concat replaces repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat(sample_tables)


def main(args):
'''
Script to convert UAS Sample Details Report (.xlsx format) to a more user-friendly
format. Also removes the Amelogenin locus and extract relevant information (e.g.
Sample ID, Project ID and Analysis ID).
Script to convert either the UAS Sample Details Report (.xlsx format using the --uas flag)
or STRait Razor output to a more user-friendly format. Also removes the Amelogenin locus
and extract relevant information (e.g. Sample ID, Project ID and Analysis ID).
'''
if args.uas:
file = pd.read_excel(io=args.input, sheet_name=0)
well_index = file[
file["Sample Autosomal STR Report"] == "Coverage Information"].index.tolist()
results_newdf = file[(well_index[0] + 2):]
results_newdf.columns = file.iloc[(well_index[0] + 1)]
results_filt = results_newdf[results_newdf.Locus != "Amelogenin"]
results_final = results_filt[['Locus', 'Reads', 'Repeat Sequence']]
results_final['SampleID'] = file.iloc[1, 1]
results_final['Project'] = file.iloc[2, 1]
results_final['Analysis'] = file.iloc[3, 1]
else:
results_final = strait_razor_concat(args.input)
path = args.input
analysisID = path.rstrip(os.sep)
analysisID_final = os.path.basename(analysisID)
results_final['Project'] = "NA"
results_final['Analysis'] = analysisID_final

file = pd.read_excel(io=args.input, sheet_name=0)
well_index = file[file["Sample Autosomal STR Report"] == "Coverage Information"].index.tolist()
results_newdf = file[(well_index[0] + 2):]
results_newdf.columns = file.iloc[(well_index[0] + 1)]
results_filt = results_newdf[results_newdf.Locus != "Amelogenin"]
results_final = results_filt[['Locus', 'Reads', 'Repeat Sequence']]
results_final['SampleID'] = file.iloc[1, 1]
results_final['Project'] = file.iloc[2, 1]
results_final['Analysis'] = file.iloc[3, 1]
output_file = sys.stdout
if args.out is not None:
results_final.to_csv(args.out, index=False)
Loading