Separate output (#43)

* Updated README and test files with new col names * Fixed bug with combining reads for sex chr STRs * Updated test files and test for combining reads * Updated README * Added code to remove amelogenin sequences in annotate * format command can take in single STRaitRazor file * Updated cli descriptions * Updated README * Initial commit * No longer remove SNPs with missing data * updated annot script * updated tests to accommodate not removing missing data * Updated tests and added test for separating output files * mkdir change Co-authored-by: Rebecca Mitchell <rebecca.mitchell@nbacc.dhs.gov>
bioforensics · Aug 20, 2021 · 991761c · 991761c
1 parent 4eaab0a
commit 991761c
Show file tree

Hide file tree

Showing 10 changed files with 499 additions and 307 deletions.
diff --git a/lusSTR/annot.py b/lusSTR/annot.py
@@ -142,7 +142,20 @@ def sort_table(table):
     return sorted_table
 
 
+def indiv_files(table, input_dir, ext):
+    output_dir = f'Separated_lusstr_Files/{input_dir}'
+    try:
+        os.mkdir(output_dir)
+    except FileExistsError:
+        pass
+    for samp in table['SampleID'].unique():
+        new_df = table[table['SampleID'] == samp]
+        new_df.to_csv(f'{output_dir}/{samp}{ext}', sep='\t', index=False)
+
+
 def main(args):
+    if args.separate and os.path.exists('Separated_lusstr_Files') is False:
+        os.mkdir('Separated_lusstr_Files')
     output_name = os.path.splitext(args.out)[0]
     input_name = os.path.splitext(args.input)[0]
     autosomal_final_table, autosomal_flank_table, columns = format_table(
@@ -159,22 +172,36 @@ def main(args):
             if args.combine:
                 if not sex_final_table.empty:
                     sex_final_table = combine_reads(sex_final_table, columns)
-                sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
+                if args.separate:
+                    indiv_files(sex_final_table, input_name, '_sexloci.txt')
+                else:
+                    sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
             else:
+                if args.separate:
+                    indiv_files(sex_final_table, input_name, '_sexloci_no_combined_reads.txt')
                 sex_final_table.to_csv(
                     f'{output_name}_sexloci_no_combined_reads.txt', index=False
                 )
         else:
-            sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
+            if args.separate:
+                indiv_files(sex_final_table, input_name, '_sexloci.txt')
+            else:
+                sex_final_table.to_csv(f'{output_name}_sexloci.txt', sep='\t', index=False)
     if not args.uas:
         autosomal_flank_table.to_csv(f'{output_name}_flanks_anno.txt', sep='\t', index=False)
         if args.combine:
             if not autosomal_final_table.empty:
                 autosomal_final_table = combine_reads(autosomal_final_table, columns)
-            autosomal_final_table.to_csv(args.out, sep='\t', index=False)
+                if args.separate:
+                    indiv_files(autosomal_final_table, input_name, '.txt')
+                else:
+                    autosomal_final_table.to_csv(args.out, sep='\t', index=False)
         else:
             autosomal_final_table.to_csv(
                 f'{output_name}_no_combined_reads.txt', sep='\t', index=False
             )
     else:
-        autosomal_final_table.to_csv(args.out, sep='\t', index=False)
+        if args.separate:
+            indiv_files(autosomal_final_table, input_name, '.txt')
+        else:
+            autosomal_final_table.to_csv(args.out, sep='\t', index=False)
diff --git a/lusSTR/cli.py b/lusSTR/cli.py
@@ -42,7 +42,9 @@ def annot_subparser(subparsers):
     cli = subparsers.add_parser('annotate')
     cli.add_argument(
         '-o', '--out', metavar='FILE',
-        help='file to which output will be written; default is terminal (stdout)'
+        help='file to which output will be written; default is terminal (stdout). If the '
+        '--separate flag is used, this will be the name of the directory which the individual '
+        'files are written to.'
     )
     cli.add_argument(
         'input', help='sample(s) in CSV format; first four columns must be Locus, NumReads, '
@@ -67,6 +69,10 @@ def annot_subparser(subparsers):
         help='Use if including the X and Y STR markers. Separate reports for these markers '
         'will be created.'
     )
+    cli.add_argument(
+        '--separate', action='store_true',
+        help='This flag will result in the creation of individual output files per sample.'
+    )
 
 
 def snps_subparser(subparsers):
@@ -93,6 +99,10 @@ def snps_subparser(subparsers):
         '--uas', action='store_true',
         help='Use if sequences have been run through the ForenSeq UAS.'
     )
+    cli.add_argument(
+        '--separate', action='store_true',
+        help='This flag will result in the creation of individual output files per sample.'
+    )
 
 
 mains = {

diff --git a/lusSTR/format.py b/lusSTR/format.py
@@ -28,6 +28,8 @@ def uas_load(inpath, sexloci=False):
         sex_strs = pd.DataFrame() if sexloci is True else None
         files = glob.glob(os.path.join(inpath, '*.xlsx'))
         for filename in sorted(files):
+            if 'Sample Details' not in filename:
+                continue
             autodata, sexdata = uas_format(filename, sexloci)
             auto_strs = auto_strs.append(autodata)
             if sexloci is True:

diff --git a/lusSTR/snps.py b/lusSTR/snps.py
@@ -62,8 +62,7 @@ def uas_format(infile, snp_type_arg):
     complemented to be reported on the forward strand; and checks that the called allele is one of
     two expected alleles for the SNP (and flags any SNP call which is unexpected).
     '''
-    data = uas_load(infile, snp_type_arg)
-    data_filt = data.loc[data['Reads'] != 0].reset_index(drop=True)
+    data_filt = uas_load(infile, snp_type_arg).reset_index(drop=True)
     data_df = []
     for j, row in data_filt.iterrows():
         snpid = data_filt.iloc[j, 0]
@@ -74,13 +73,16 @@ def uas_format(infile, snp_type_arg):
             forward_strand_allele = complement_base(uas_allele)
         else:
             forward_strand_allele = uas_allele
-        if forward_strand_allele in metadata['Alleles']:
+        if data_filt.loc[j, 'Typed Allele?'] == 'No':
+            flag = 'Contains untyped allele'
+        elif forward_strand_allele in metadata['Alleles']:
             flag = ''
         else:
             flag = 'Allele call does not match expected allele!'
         row_tmp = [
-            data_filt.iloc[j, 3], data_filt.iloc[j, 4], data_filt.iloc[j, 5], snpid,
-            data_filt.iloc[j, 1], forward_strand_allele, uas_allele, snp_type_dict[type], flag
+            data_filt.loc[j, 'SampleID'], data_filt.loc[j, 'Project'],
+            data_filt.loc[j, 'Analysis'], snpid, data_filt.loc[j, 'Reads'], forward_strand_allele,
+            uas_allele, snp_type_dict[type], flag
         ]
         data_df.append(row_tmp)
     data_final = pd.DataFrame(data_df, columns=[
@@ -133,10 +135,10 @@ def parse_snp_table_from_sheet(infile, sheet, snp_type_arg):
     file = openpyxl.load_workbook(infile)
     file_sheet = file[sheet]
     table = pd.DataFrame(file_sheet.values)
-    offset = table[table.iloc[:, 0] == "Coverage Information"].index.tolist()[0]
+    offset = table[table.iloc[:, 0] == 'Coverage Information'].index.tolist()[0]
     data = table.iloc[offset + 2:]
     data.columns = table.iloc[offset + 1]
-    data = data[['Locus', 'Reads', 'Allele Name']]
+    data = data[['Locus', 'Reads', 'Allele Name', 'Typed Allele?']]
     final_df = pd.DataFrame()
     if snp_type_arg == 'all':
         final_df = data
@@ -332,12 +334,26 @@ def snp_call_exception(seq, expected_size, metadata, base):
         return base, flag
 
 
+def indiv_files(table, input_dir, ext):
+    output_dir = f'Separated_lusstr_Files/{input_dir}'
+    os.makedirs(output_dir, exist_ok=True)
+    for samp in table['SampleID'].unique():
+        new_df = table[table['SampleID'] == samp]
+        new_df.to_csv(f'{output_dir}/{samp}{ext}', sep='\t', index=False)
+
+
 def main(args):
+    output_name = os.path.splitext(args.out)[0]
     if args.uas:
         results = uas_format(args.input, args.type)
-        results.to_csv(args.out, index=False, sep='\t')
+        if args.separate:
+            indiv_files(results, output_name, '.txt')
+        else:
+            results.to_csv(args.out, index=False, sep='\t')
     else:
         results, results_combined = strait_razor_format(args.input, args.type)
-        output_name = os.path.splitext(args.out)[0]
-        results_combined.to_csv(args.out, index=False, sep='\t')
+        if args.separate:
+            indiv_files(results_combined, output_name, '.txt')
+        else:
+            results_combined.to_csv(args.out, index=False, sep='\t')
         results.to_csv(f'{output_name}_full_output.txt', index=False, sep='\t')
diff --git a/lusSTR/tests/data/UAS_bulk_input/Positive Control Sample Details Report 2316.xlsx b/lusSTR/tests/data/UAS_bulk_input/Positive Control Sample Details Report 2316.xlsx