Skip to content

Commit

Permalink
Merge pull request #106 from bioinform/test-params
Browse files Browse the repository at this point in the history
Bugfix
  • Loading branch information
marghoob committed Mar 5, 2016
2 parents 4f634e9 + e88923c commit 7f9ae27
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 133 deletions.
2 changes: 1 addition & 1 deletion metasv/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5"
__version__ = "0.5.1"
48 changes: 4 additions & 44 deletions metasv/run_age.py → metasv/age.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import traceback
import os
import argparse
import multiprocessing
import subprocess
import hashlib
Expand Down Expand Up @@ -90,9 +89,6 @@ def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference
with open(ref_name, "w") as file_handle:
file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))




age_records = []
thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
for contig in contig_dict[region]:
Expand All @@ -103,7 +99,6 @@ def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference
if region_object.length()>max_interval_len_truncation_age and contig.sv_type in ["INV","DEL","DUP"]:
# For large SVs, middle sequences has no effect on genotyping. So, we truncate middle region of reference to speed up
thread_logger.info("Truncate the reference sequence.")


truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age +1
truncate_end = len(reference_sequence) - (pad + dist_to_expected_bp + truncation_pad_read_age)
Expand Down Expand Up @@ -181,8 +176,9 @@ def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference
age_window=age_window, sc_locations=sc_locations)
bedtools_fields = matching_interval.fields
if len(breakpoints) == 1 and sv_type == "INS":
bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1, breakpoints[0][1], breakpoints[0][2]])
elif len(breakpoints) == 2 and (sv_type in ["DEL","INV","DUP"]):
info_dict["INSERTION_SEQUENCE"] = breakpoints[0][2]
bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1, breakpoints[0][1], "."])
elif len(breakpoints) == 2 and (sv_type in ["DEL", "INV", "DUP"]):
bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]] + ["."])
else:
bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
Expand Down Expand Up @@ -308,40 +304,4 @@ def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_
merged_bed = os.path.join(age_workdir, "breakpoints.bed")
bedtool.sort().saveas(merged_bed)

return merged_bed


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run AGE on files assembled under MetaSV.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--reference", help="Reference FASTA", required=True, type=file)
parser.add_argument("--assembly", help="Assembly FASTA", required=True, type=file)
parser.add_argument("--age", help="Path to AGE executable", required=True, type=file)
parser.add_argument("--work", help="Work directory", default="work")
parser.add_argument("--pad", help="Padding to apply on both sides of the bed regions", type=int, default=AGE_PAD)
parser.add_argument("--nthreads", help="Number of threads to use", type=int, default=1)
parser.add_argument("--chrs", help="Chromosome list to process", nargs="+", default=[])
parser.add_argument("--sv_types", help="SV types list to process (INS, DEL, INV)", nargs="+", default=[])
parser.add_argument("--timeout", help="Max time for assembly processes to run", type=int, default=AGE_TIMEOUT)
parser.add_argument("--keep_temp", help="Don't delete temporary files", action="store_true")
parser.add_argument("--assembly_tool", help="Tool used for assembly", choices=["spades", "tigra"], default="spades")
parser.add_argument("--min_contig_len", help="Minimum length of contig to consider", type=int,
default=AGE_MIN_CONTIG_LENGTH)
parser.add_argument("--max_region_len", help="Maximum length of an SV interval", type=int,
default=AGE_MAX_REGION_LENGTH)
parser.add_argument("--min_del_subalign_len", help="Minimum length of deletion sub-alginment", type=int,
default=MIN_DEL_SUBALIGN_LENGTH)
parser.add_argument("--min_inv_subalign_len", help="Minimum length of inversion sub-alginment", type=int,
default=MIN_INV_SUBALIGN_LENGTH)
parser.add_argument("--age_window", help="Window size for AGE to merge nearby breakpoints", type=int,
default=AGE_WINDOW_SIZE)
parser.add_argument("--intervals_bed", help="BED file for assembly", type=file, required=True)

args = parser.parse_args()

run_age_parallel(intervals_bed=args.intervals_bed.name, reference=args.reference.name, assembly=args.assembly.name,
pad=args.pad, age=args.age.name, age_workdir=args.work, timeout=args.timeout,
keep_temp=args.keep_temp, assembly_tool=args.assembly_tool, chrs=args.chrs, nthreads=args.nthreads,
min_contig_len=args.min_contig_len, max_region_len=args.max_region_len, sv_types=args.sv_types,
min_del_subalign_len=args.min_del_subalign_len, min_inv_subalign_len=args.min_inv_subalign_len,
age_window = args.age_window)
return merged_bed
1 change: 1 addition & 0 deletions metasv/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
SC_MIN_MAPQ = 5
SC_MAX_NM = 10
SC_MIN_MATCHES = 50
SC_OTHER_SCALE = 5


ISIZE_MIN = 250
Expand Down
12 changes: 5 additions & 7 deletions metasv/generate_final_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ def get_interval_info(feature,pass_calls):
info = dict()
if len(feature.fields) > 10:
info.update(json.loads(base64.b64decode(feature.fields[10])))
if feature.fields[9] != ".":
info["INSERTION_SEQUENCE"] = feature.fields[9]

index_to_use = 0
is_pass = False
Expand All @@ -93,7 +91,7 @@ def get_interval_info(feature,pass_calls):
return None

if sv_type != "INS":
svmethods_s = set(svmethods) - {"SC","AS"}
svmethods_s = set(svmethods) - {"SC", "AS"}
is_pass = len(svmethods_s) > 1
if "AS" in svmethods:
pos, end, svlen = map(int, feature.fields[6:9])
Expand All @@ -105,7 +103,7 @@ def get_interval_info(feature,pass_calls):
if sv_type == "DEL":
svlen = -svlen
else:
if ("SC" in svmethods or "AS" in svmethods):
if "SC" in svmethods or "AS" in svmethods:
# TODO: I think it should be sub_types.index
#index_to_use = [i for i,methods in enumerate(sub_methods) if ("SC" in methods) or ("AS" in svmethods)][0]
pos, end, svlen = map(int, feature.fields[6:9])
Expand All @@ -119,7 +117,6 @@ def get_interval_info(feature,pass_calls):
func_logger.info("Variant with pos < 1 encountered. Skipping! %s" % str(feature))
return None


info.update(
{"END": end, "SVLEN": svlen, "SVTYPE": sv_type, "SVMETHOD": svmethods, "NUM_SVMETHODS": len(svmethods)})
if "IMPRECISE" in info:
Expand All @@ -129,6 +126,7 @@ def get_interval_info(feature,pass_calls):
"sv_length": abs(svlen), "svmethods": svmethods, "sv_filter": sv_filter}
return interval_info


def filter_confused_INS_calls(nonfilterd_bed, filterd_bed, wiggle=20):

nonins_intervals=[]
Expand All @@ -140,8 +138,8 @@ def filter_confused_INS_calls(nonfilterd_bed, filterd_bed, wiggle=20):
for interval in bedtool_good_nonINS:
start=interval.start
end=interval.end
nonINS_bp_intervals.append(pybedtools.Interval(interval.chrom,max(start-wiggle,0),start+wiggle))
nonINS_bp_intervals.append(pybedtools.Interval(interval.chrom,max(end-wiggle,0),end+wiggle))
nonINS_bp_intervals.append(pybedtools.Interval(interval.chrom, max(start-wiggle,0),start+wiggle))
nonINS_bp_intervals.append(pybedtools.Interval(interval.chrom, max(end-wiggle,0),end+wiggle))

bedtool_bp_nonINS=pybedtools.BedTool(nonINS_bp_intervals)
bad_INS=bedtool_INS.window(bedtool_bp_nonINS,w=wiggle)
Expand Down
Loading

0 comments on commit 7f9ae27

Please sign in to comment.