Skip to content

Commit

Permalink
Disable skipping regions without candidates during training.
Browse files Browse the repository at this point in the history
Now the speedup applies only when proposed variants are supplied, that is, when running with VcfCandidateImporter.

PiperOrigin-RevId: 348500773
  • Loading branch information
MariaNattestad authored and copybara-github committed Dec 21, 2020
1 parent 87e7c60 commit 4a11046
Show file tree
Hide file tree
Showing 23 changed files with 30 additions and 35 deletions.
53 changes: 24 additions & 29 deletions deepvariant/make_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -1973,35 +1973,30 @@ def processing_regions_from_options(options):
num_shards=options.num_shards)

region_list = list(regions)
# When processing many regions, check for a VCF to narrow down the regions.
if not gvcf_output_enabled(options) and len(region_list) > 10000:
if in_training_mode(options):
filter_vcf = options.truth_variants_filename
logging_with_options(
options, 'Reading VCF to see if we can skip processing some regions '
'without variants in the --truth_variants VCF.')
else:
filter_vcf = options.proposed_variants_filename
if filter_vcf:
logging_with_options(
options, 'Reading VCF to skip processing some regions without '
'variants in the --proposed_variants VCF.')
if filter_vcf:
before = time.time()
variant_positions = []
with vcf.VcfReader(filter_vcf) as vcf_reader:
for variant in vcf_reader:
variant_positions.append(variant_utils.variant_position(variant))

filtered_regions = filter_regions_by_vcf(region_list, variant_positions)
time_elapsed = time.time() - before
logging_with_options(
options,
'Filtering regions took {} seconds and reduced the number of '
'regions to process from {} to {} regions containing variants '
'from the supplied VCF.'.format(
round(time_elapsed, 2), len(region_list), len(filtered_regions)))
return filtered_regions
# When using VcfCandidateImporter, it is safe to skip regions without
# candidates as long as gVCF output is not needed. There is a tradeoff
# though because it takes time to read the VCF, which is only worth it if
# there are enough regions.
if options.proposed_variants_filename and len(
region_list) > 10000 and not gvcf_output_enabled(options):
logging_with_options(
options, 'Reading VCF to skip processing some regions without '
'variants in the --proposed_variants VCF.')
before = time.time()
variant_positions = []
with vcf.VcfReader(options.proposed_variants_filename) as vcf_reader:
for variant in vcf_reader:
variant_positions.append(variant_utils.variant_position(variant))

filtered_regions = filter_regions_by_vcf(region_list, variant_positions)
time_elapsed = time.time() - before
logging_with_options(
options, 'Filtering regions took {} seconds and reduced the number of '
'regions to process from {} to {} regions containing variants '
'from the supplied VCF of proposed variants.'.format(
trim_runtime(time_elapsed), len(region_list),
len(filtered_regions)))
return filtered_regions
return region_list


Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified deepvariant/testdata/golden.calling_candidates.tfrecord.gz
Binary file not shown.
Binary file modified deepvariant/testdata/golden.calling_examples.tfrecord.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified deepvariant/testdata/golden.postprocess_gvcf_input.tfrecord.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified deepvariant/testdata/golden.training_examples.tfrecord
Binary file not shown.
Binary file modified deepvariant/testdata/golden.training_examples.tfrecord.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -3530,12 +3530,12 @@ labeling_metrics {
resource_metrics {
host_name: "host"
physical_core_count: 6
cpu_frequency_mhz: 1249.1485
cpu_frequency_mhz: 1200.1029999999998
total_memory_mb: 63978
wall_time_seconds: 0.9555103778839111
cpu_user_time_seconds: 5.931328
cpu_system_time_seconds: 2.363702
memory_peak_rss_mb: 389
read_bytes: 799694848
wall_time_seconds: 1.744377613067627
cpu_user_time_seconds: 5.804764
cpu_system_time_seconds: 2.572658
memory_peak_rss_mb: 381
read_bytes: 807477248
write_bytes: 20480
}
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 4a11046

Please sign in to comment.