From 70adf91e1a77c0a1436281ca42b4e32c3dab1796 Mon Sep 17 00:00:00 2001
From: Valentin Ruano Rubio
Date: Tue, 29 May 2018 14:04:45 -0400
Subject: [PATCH] Vrr single contig reference aligner (#4780)
* Added util class to align against an on-the-fly single contig reference using Bwa-Mem.
The new class returns List already translated from the BwaMenAlignment object returned by the BwaMem aligner.
* Some improvements in RandomDNA
---
.../utils/SingleSequenceReferenceAligner.java | 211 ++++++++++++++++++
.../SingleContigReferenceAlignerUnitTest.java | 145 ++++++++++++
.../hellbender/utils/RandomDNA.java | 150 ++++++++++---
.../hellbender/utils/RandomDNAUnitTest.java | 105 ++++++++-
4 files changed, 576 insertions(+), 35 deletions(-)
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
new file mode 100644
index 00000000000..80804acd73b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
@@ -0,0 +1,211 @@
+package org.broadinstitute.hellbender.tools.spark.sv.utils;
+
+import htsjdk.samtools.SAMFlag;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemAligner;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemIndex;
+import org.broadinstitute.hellbender.utils.reference.FastaReferenceWriter;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+/**
+ * Encompasses an aligner to a single-sequence reference.
+ *
+ * This is an {@link AutoCloseable} mean to be use in try-with-resource constructs.
+ *
+ *
+ * If you don't do so, please remember to call {@link #close} at the end to free
+ * resources.
+ *
+ */
+public class SingleSequenceReferenceAligner implements AutoCloseable {
+
+ private final File image;
+ private final BwaMemAligner aligner;
+ private final BwaMemIndex index;
+ private final List refNames;
+ private boolean closed = false;
+
+ @FunctionalInterface
+ public interface TriFunction{
+ W apply(T t, U u, V v);
+ }
+
+ public static final Predicate super BwaMemAlignment> NO_FILTER = x -> true;
+
+ private final Function super T, List> basesOf;
+ private final TriFunction super T, List>, List, ? extends U> alignmentOf;
+ private final Predicate super BwaMemAlignment> alignmentFilter;
+
+ public SingleSequenceReferenceAligner(final String name, final byte[] bases,
+ final Function super T, List> basesOf,
+ final TriFunction super T, List>, List, ? extends U> alignmentOf) {
+ this(name, bases, basesOf, alignmentOf, NO_FILTER);
+ }
+
+ public SingleSequenceReferenceAligner(final String name, final byte[] bases,
+ final Function super T, List> basesOf,
+ final TriFunction super T, List>, List, ? extends U> alignmentOf,
+ final Predicate super BwaMemAlignment> alignmentFilter) {
+ Utils.nonNull(name, "the input reference name cannot be null");
+ Utils.nonNull(bases, "the input bases cannot be null");
+ this.basesOf = Utils.nonNull(basesOf);
+ this.alignmentOf = Utils.nonNull(alignmentOf);
+ this.alignmentFilter = Utils.nonNull(alignmentFilter);
+ Utils.validate(bases.length > 0, "the reference contig bases sequence must have at least one base");
+ try {
+ final File fasta = File.createTempFile("ssvh-temp", ".fasta");
+ fasta.deleteOnExit();
+ image = new File(fasta.getParentFile(), fasta.getName().replace(".fasta", ".img"));
+ image.deleteOnExit();
+ FastaReferenceWriter.writeSingleSequenceReference(fasta.toPath(), false, false, name, null, bases);
+ BwaMemIndex.createIndexImageFromFastaFile(fasta.toString(), image.toString());
+ fasta.delete(); // we don't need the fasta around.
+ index = new BwaMemIndex(image.toString());
+ aligner = new BwaMemAligner(index);
+ } catch (final IOException ex) {
+ throw new GATKException("could not create index files", ex);
+ }
+ refNames = Collections.singletonList(name);
+ }
+
+ /**
+ * Gives access to the underlying aligner so that you can modify its options.
+ *
+ * You could align sequences directly thru the return object but in that case you will lose the
+ * {@link BwaMemAlignment} to {@link AlignmentInterval} translation.
+ *
+ *
+ * @return never {@code null}.
+ */
+ public BwaMemAligner getAligner() {
+ return aligner;
+ }
+
+ public List align(final Iterable extends T> inputs) {
+ Utils.nonNull(inputs);
+ return align(Utils.stream(inputs).collect(Collectors.toList()));
+ }
+
+ /**
+ * Aligns the input object returning a list of the outputs in the corresponding order.
+ * @param inputs
+ * @return
+ */
+ public List align(final List extends T> inputs) {
+ checkNotClosed();
+ Utils.nonNull(inputs, "the input sequence array cannot be null");
+ final List> seqs = inputs.stream().map(basesOf).collect(Collectors.toList());
+ final List flattenSeqs = seqs.stream().flatMap(Collection::stream).collect(Collectors.toList());
+ final List> alignments = aligner.alignSeqs(flattenSeqs);
+ if (alignments.size() != flattenSeqs.size()) { // paranoiah??
+ throw new IllegalStateException("something went terribly wrong and the number of returned alignment list does " +
+ "not correspond to the number of input sequences: " + alignments.size() + " != " + flattenSeqs.size());
+ }
+ final List result = new ArrayList<>(inputs.size());
+ int nextAlignmentIndex = 0;
+ for (int i = 0; i < inputs.size(); i++) {
+ final T inputObject = inputs.get(i);
+ final List sequences = seqs.get(i);
+ final List> relevantAlignments =
+ alignments.subList(nextAlignmentIndex, nextAlignmentIndex += sequences.size());
+ final List> filteredAlignments;
+ if (alignmentFilter == NO_FILTER) {
+ filteredAlignments = relevantAlignments;
+ } else {
+ filteredAlignments = new ArrayList<>(relevantAlignments.size());
+ for (int j = 0; j < relevantAlignments.size(); j++) {
+ filteredAlignments.add(relevantAlignments.get(j).stream().filter(alignmentFilter).collect(Collectors.toList()));
+ }
+ }
+ final U outputObject = alignmentOf.apply(inputObject, filteredAlignments, refNames);
+ result.add(outputObject);
+ }
+ return result;
+ }
+
+ /**
+ * Composes a map of the aligned sequences.
+ *
+ * The key of such a map would be determined by the input object and the output alignment.
+ *
+ *
+ * Iterations over the entries, keys and values of the resulting tree will follow the order
+ * of the input objects. In case of key collisions (more than one input object, alignment result in the same key)
+ * then we only keep the first occurrence of such key.
+ *
+ * @param inputs the input objects to align.
+ * @param keyOf function that composed the key given the input and output objects.
+ * @param
+ * @return never {@code null}.
+ */
+ public Map align(final List extends T> inputs, final BiFunction super T, ? super U, ? extends V> keyOf) {
+ final List outs = align(inputs);
+ final LinkedHashMap result = new LinkedHashMap<>(outs.size());
+ for (int i = 0; i < outs.size(); i++) {
+ result.putIfAbsent(keyOf.apply(inputs.get(i), outs.get(i)), outs.get(i));
+ }
+ return result;
+ }
+
+ /**
+ * Composes a contig aligner from an arbitrary input type given contig name and base sequence generation functions.
+ *
+ * Supplementary alignments will be ignored.
+ *
+ *
+ * @param refName the name of the only reference sequence.
+ * @param refBases the bases for the only reference sequence.
+ * @param nameOf function that produces the aligned contig name based on the input object.
+ * @param basesOf function that produces the contig bases sequence based on the input object.
+ * @param the type-parameter of the input object.
+ * @return never {@code null}.
+ */
+ public static SingleSequenceReferenceAligner contigsAligner(final String refName, final byte[] refBases,
+ final Function super T, String> nameOf,
+ final Function super T, byte[]> basesOf) {
+ return new SingleSequenceReferenceAligner<>(refName, refBases,
+ t -> Collections.singletonList(basesOf.apply(t)),
+ (t, bma, refNames) -> {
+ final String name = nameOf.apply(t);
+ final byte[] bases = basesOf.apply(t);
+ final List intervals = bma.get(0).stream().map(b -> new AlignmentInterval(b, refNames, bases.length)).collect(Collectors.toList());
+ return new AlignedContig(name, bases, intervals);
+ },
+ bma -> bma.getRefId() >= 0 && SAMFlag.SECONDARY_ALIGNMENT.isUnset(bma.getSamFlag()));
+ }
+
+ private void checkNotClosed() {
+ if (closed) {
+ throw new IllegalStateException("operation not allowed once the aligner is closed");
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (!closed) {
+ aligner.close();
+ closed = true;
+ try {
+ index.close();
+ } finally {
+ image.delete();
+ }
+ }
+ }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
new file mode 100644
index 00000000000..fbd20d012d0
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
@@ -0,0 +1,145 @@
+package org.broadinstitute.hellbender.tools.spark.sv.utils;
+
+import htsjdk.samtools.Cigar;
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.TextCigarCodec;
+import htsjdk.samtools.util.SequenceUtil;
+import org.apache.commons.math3.util.Pair;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
+import org.broadinstitute.hellbender.utils.RandomDNA;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.CigarUtils;
+import org.broadinstitute.hellbender.utils.test.BaseTest;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.stream.Collectors;
+
+/**
+ * Unit tests for {@link SingleSequenceReferenceAligner}
+ */
+public class SingleContigReferenceAlignerUnitTest extends BaseTest {
+
+ private static final int READ_LENGTH = 250;
+ private static final int NUM_ALIGNS = 1000;
+ private static final String REF_NAME = "ref00";
+
+ @Test(expectedExceptions = IllegalStateException.class, expectedExceptionsMessageRegExp = "^.*closed.*$")
+ public void testClosedException() {
+ final RandomDNA rdn = new RandomDNA(131);
+ final byte[] refBases = rdn.nextBases(1000);
+ final SingleSequenceReferenceAligner aligner = SingleSequenceReferenceAligner.contigsAligner("test", refBases,
+ a -> "contig", a -> a);
+ try {
+ aligner.close();
+ } catch (final IOException ex) {
+ Assert.fail("unexpected exception when closing");
+ }
+ aligner.align(Collections.singletonList(Arrays.copyOfRange(refBases, 100, 200)));
+ }
+
+
+ @Test(dataProvider = "testAlignmentData")
+ public void testAlignment(final boolean pairAlignment, final byte[] reference, final String referenceName)
+ throws IOException
+ {
+ try (final SingleSequenceReferenceAligner aligner = SingleSequenceReferenceAligner.contigsAligner(referenceName, reference,
+ a -> "ctg", a -> a);) {
+ Assert.assertNotNull(aligner.getAligner());
+ if (pairAlignment) {
+ aligner.getAligner().alignPairs();
+ }
+ final Random rdn = new Random(13111);
+ final RandomDNA rdnDNA = new RandomDNA(rdn);
+ final List> expected = new ArrayList<>(NUM_ALIGNS << 1);
+ final List seqs = new ArrayList<>(NUM_ALIGNS << 1);
+ for (int i = 0; i < NUM_ALIGNS; i++) {
+ final int start = rdn.nextInt(reference.length - READ_LENGTH) + 1;
+ final boolean forward = rdn.nextBoolean();
+ final boolean insert = rdn.nextDouble() < 0.1;
+ final boolean deletion = !insert && rdn.nextDouble() < 0.1;
+ final int indelLength = insert || deletion ? rdn.nextInt(10) + 10 : 0;
+ final int indelStart = insert || deletion ? rdn.nextInt((int) (READ_LENGTH * .50)) + 25 : -1;
+ final int end = start + READ_LENGTH - 1;
+ final byte[] templateSeq = Arrays.copyOfRange(reference, start - 1, end);
+ final byte[] actualSeq;
+ if (insert) {
+ actualSeq = Arrays.copyOf(templateSeq, templateSeq.length + indelLength);
+ System.arraycopy(actualSeq, indelStart - 1, actualSeq, indelStart - 1 + indelLength, templateSeq.length - indelStart + 1);
+ rdnDNA.nextBases(actualSeq, indelStart - 1, indelLength);
+ } else if (deletion) {
+ actualSeq = Arrays.copyOf(templateSeq, templateSeq.length - indelLength);
+ System.arraycopy(templateSeq, indelStart - 1 + indelLength, actualSeq, indelStart - 1, templateSeq.length - indelStart + 1 - indelLength);
+ } else {
+ actualSeq = templateSeq.clone();
+ }
+
+ while (insert && actualSeq[indelStart - 1] == actualSeq[indelStart + indelLength - 1]) {
+ actualSeq[indelStart + indelLength - 1] = rdnDNA.nextBase();
+ }
+ if (!forward) {
+ SequenceUtil.reverseComplement(actualSeq);
+ }
+ seqs.add(actualSeq);
+ final Cigar cigar = (!insert && !deletion) ? TextCigarCodec.decode(READ_LENGTH + "M"):
+ (insert ? TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "I" + (READ_LENGTH - indelStart + 1) + "M")
+ : TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "D" + (READ_LENGTH - indelStart + 1 - indelLength) + "M"));
+
+ expected.add(Collections.singletonList(new AlignmentInterval(new SimpleInterval(referenceName, start, end), 1, actualSeq.length, !forward ? CigarUtils.invertCigar(cigar) : cigar
+ , forward, 0, 0, 0, null)));
+ }
+ final List results = aligner.align(seqs);
+ final Map mapResult = aligner.align(seqs, (b, a) -> b);
+ Assert.assertEquals(results, new ArrayList<>(mapResult.values()));
+ Assert.assertEquals(new ArrayList<>(mapResult.keySet()), mapResult.values().stream().map(AlignedContig::getContigSequence).collect(Collectors.toList()));
+ for (int i = 0; i < NUM_ALIGNS; i++) {
+ final List actualValue = results.get(i).getAlignments();
+ final List expectedValue = expected.get(i);
+ Assert.assertEquals(actualValue.size(), 1);
+ Assert.assertEquals(actualValue.get(0).forwardStrand, expectedValue.get(0).forwardStrand);
+ Assert.assertEquals(actualValue.get(0).referenceSpan, expectedValue.get(0).referenceSpan, expectedValue.get(0).cigarAlong5to3DirectionOfContig.toString());
+ Assert.assertEquals(actualValue.get(0).startInAssembledContig, expectedValue.get(0).startInAssembledContig);
+ Assert.assertEquals(actualValue.get(0).endInAssembledContig, expectedValue.get(0).endInAssembledContig);
+ final Cigar expectedCigar = expectedValue.get(0).cigarAlong5to3DirectionOfContig;
+ final Cigar actualCigar = actualValue.get(0).cigarAlong5to3DirectionOfContig;
+ if (!expectedCigar.equals(actualCigar)) { // small differences may occur due to ambiguous indel location. So we check that they are small differences indeed:
+ Assert.assertEquals(expectedCigar.numCigarElements(), actualCigar.numCigarElements()); // same number of elements
+ Assert.assertEquals(expectedCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList()),
+ actualCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList())); // same operators sequence.
+ // then we check the total lengths per operator (must be the same):
+ final Map expectedLengthByOperator = expectedCigar.getCigarElements().stream()
+ .collect(Collectors.groupingBy(CigarElement::getOperator,
+ Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
+ final Map actualLengthByOperator = actualCigar.getCigarElements().stream()
+ .collect(Collectors.groupingBy(CigarElement::getOperator,
+ Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
+ Assert.assertEquals(actualLengthByOperator, expectedLengthByOperator);
+ // finally we don't allow more than 5 bases length difference for any given element.
+ for (int j = 0; j < expectedCigar.numCigarElements(); j++) {
+ Assert.assertTrue(Math.abs(expectedCigar.getCigarElement(j).getLength() - actualCigar.getCigarElement(j).getLength()) < 10, "actual: " + actualCigar + " != expected: " + expectedCigar);
+ }
+ }
+ }
+ }
+ }
+
+ @DataProvider(name="testAlignmentData")
+ public Object[][] testAlignmentData() {
+ final List
*
+ *
+ * This class is written in a way that ensure that the sequence of bases returned would only depend on the seed and
+ * not the sequence of calls to different base generating methods.
+ *
+ *
+ * For example a single call asking for 132 bases using {@link #nextBases(int)} would generate the same sequence
+ * as calling 132 times {@link #nextBase()} if both random instances were initialized with the same seed.
+ *
+ *
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
*/
public final class RandomDNA {
- private final Random random;
+ /**
+ * Number of bases that can be packed in a int (bits in an int divide by 2 bits per base).
+ *
+ * In practice the code below would work if this constant takes any values between 3 and the actual number
+ * of bit pairs in an int (16). The maximum (16) is recommended.
+ *
+ */
+ @VisibleForTesting
+ static final int BASES_IN_AN_INT = Integer.SIZE >> 1; // = 16; and is extremely unlikely to ever change.
+
+ /**
+ * Convenient constant
+ */
+ private static final int BASES_IN_AN_INT_MINUS_1 = BASES_IN_AN_INT - 1;
+
+ /**
+ * Maximum capacity of the next bytes queue.
+ *
+ * It must be at least {@link #BASES_IN_AN_INT} and ideally a multiple of that number
+ * (in order to avoid unused space in the base-buffer).
+ */
+ @VisibleForTesting
+ static final int NEXT_BASES_MAX_CAPACITY = BASES_IN_AN_INT * 64; // = 1024 bases/bytes extremelly unlikely to ever change.
+
+ protected final Random random;
+
+ private final byte[] codeToBase = new byte[] {
+ Nucleotide.A.toBase(),
+ Nucleotide.C.toBase(),
+ Nucleotide.G.toBase(),
+ Nucleotide.T.toBase()
+ };
+
+ private final byte[] nextBases;
+ private int firstBaseIndex;
+ private int lastBaseIndex;
+ private int nextBasesSize;
/**
* Constructs a new random DNA generator.
@@ -47,6 +90,10 @@ public RandomDNA() {
*/
public RandomDNA(final Random rnd) {
random = Utils.nonNull(rnd, "the random number generator cannot be null");
+ nextBases = new byte[NEXT_BASES_MAX_CAPACITY];
+ firstBaseIndex = 0;
+ lastBaseIndex = -1;
+ nextBasesSize = 0;
}
/**
@@ -66,10 +113,10 @@ public RandomDNA(final long seed) {
*
*
* @param destination the array to update.
- * @throws NullPointerException if {@code destination} is {@code null}.
+ * @throws IllegalArgumentException if {@code destination} is {@code null}.
*/
public void nextBases(final byte[] destination) {
- nextBases(destination, 0, destination.length);
+ nextBases(Utils.nonNull(destination, "the destination array must not be null"), 0, destination.length);
}
/**
@@ -87,31 +134,54 @@ public void nextBases(final byte[] dest, final int offset, final int length) {
ParamUtils.isPositiveOrZero(length, "the input length must be positive");
final int to = offset + length;
ParamUtils.inRange(to, 0, dest.length, "the length provided goes beyond the end of the destination array");
- for (int i = offset; i < to; ) {
- int randomInt = random.nextInt();
- for (int j = 0; i < to && j < Integer.SIZE; j += 2) {
- final int ord = randomInt & 0x000000003;
- randomInt >>= 2;
- switch (ord) {
- case 0:
- dest[i++] = 'A';
- break;
- case 1:
- dest[i++] = 'C';
- break;
- case 2:
- dest[i++] = 'G';
- break;
- case 3:
- dest[i++] = 'T';
- break;
- default:
- throw new GATKException.ShouldNeverReachHereException("this cannot be happening!!! " + ord);
- }
+ int remaining = length;
+ int nextDestIndex = offset;
+ while (remaining > 0) {
+ final int available = loadNextBases(remaining);
+ for (int i = 0; i < available; i++) {
+ dest[nextDestIndex++] = removeNextBase();
}
+ remaining -= available;
}
}
+ /**
+ * Load new bases to the next bases queue to try to satisfy a required number of next-bases.
+ *
+ *
+ * It might be that there is not enough space in the buffer to satisfy the target, in which case this will return
+ * something close to the capacity of the buffer, expecting the caller to clear the buffer and repeat the request
+ * for the remainder.
+ *
+ *
+ * @param targetSize the number of bases that the caller would like to have at its disposal.
+ * @return the amount of bases available given the current content of the next-bases queue, its max capacity
+ * and the requested target.
+ */
+ private int loadNextBases(final int targetSize) {
+ final int currentNumberOfBases = nextBasesSize;
+ if (currentNumberOfBases >= targetSize) {
+ return targetSize;
+ } else if (NEXT_BASES_MAX_CAPACITY - currentNumberOfBases < BASES_IN_AN_INT) {
+ // If we try to load more bases it would go over the max capacity.
+ return currentNumberOfBases;
+ } else {
+ final int result = Math.min(NEXT_BASES_MAX_CAPACITY, targetSize);
+ final int increaseInInts = (result - currentNumberOfBases + BASES_IN_AN_INT_MINUS_1) / BASES_IN_AN_INT;
+ for (int i = 0; i < increaseInInts; i++) {
+ int randomInt = random.nextInt();
+ addNextBase(codeToBase[randomInt & 0x03]);
+ int j = 1;
+ do {
+ randomInt >>>= 2;
+ addNextBase(codeToBase[randomInt & 0x03]);
+ } while(++j < BASES_IN_AN_INT_MINUS_1);
+ randomInt >>>= 2;
+ addNextBase(codeToBase[randomInt]); // for the last base there is no need to bother with the mask.
+ }
+ return result;
+ }
+ }
/**
* Returns a random DNA sequence of bases.
@@ -130,7 +200,7 @@ public byte[] nextBases(final int size) {
/**
* Create a random reference and writes in FASTA format into a temporal file.
*
- * The output file is instanciated using {@link File#createTempFile}.
+ * The output file is instantiated using {@link File#createTempFile}.
*
*
* The invoking code is responsible to manage and dispose of the output file.
@@ -194,4 +264,28 @@ public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final
}
}
}
+
+ public byte nextBase() {
+ loadNextBases(1); // this never will fail to have 1 available base.
+ return removeNextBase();
+ }
+
+
+ private void addNextBase(final byte base) {
+ lastBaseIndex++;
+ if (lastBaseIndex == NEXT_BASES_MAX_CAPACITY) {
+ lastBaseIndex = 0;
+ }
+ nextBases[lastBaseIndex] = base;
+ nextBasesSize++;
+ }
+
+ private byte removeNextBase() {
+ final byte result = nextBases[firstBaseIndex++];
+ if (firstBaseIndex == NEXT_BASES_MAX_CAPACITY) {
+ firstBaseIndex = 0;
+ }
+ nextBasesSize--;
+ return result;
+ }
}
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
index 31803ab6096..6b4add1ce9d 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
@@ -19,6 +19,9 @@
import java.util.List;
import java.util.stream.IntStream;
+/**
+ * Unit tests for {@link RandomDNA}.
+ */
public final class RandomDNAUnitTest {
private static final int TEST_BASES_PER_LINE = 73;
@@ -35,6 +38,101 @@ public void testRandomFasta(final SAMSequenceDictionary dict) throws IOException
}
}
+ @Test
+ public void testBufferMaxCapacity() {
+ Assert.assertTrue(RandomDNA.NEXT_BASES_MAX_CAPACITY >= RandomDNA.BASES_IN_AN_INT, "invalid value for NEXT_BASES_MAX_CAPACITY");
+ }
+
+ @Test
+ public void testBasesInIntConstantConstraints() {
+ Assert.assertTrue(RandomDNA.BASES_IN_AN_INT >= 3);
+ Assert.assertTrue(RandomDNA.BASES_IN_AN_INT <= Integer.SIZE / 2);
+ }
+
+ /**
+ * Checks that several ways to compose an array of bases actually result on the same sequence of bases
+ * given a fixed seed.
+ */
+ @Test
+ public void testSequenceEquivalenceShort() {
+ final int seed = 1311;
+ final RandomDNA rdn1 = new RandomDNA(seed);
+ final RandomDNA rdn2 = new RandomDNA(seed);
+ final RandomDNA rdn3 = new RandomDNA(seed);
+ final RandomDNA rdn4 = new RandomDNA(seed);
+ final RandomDNA rdn5 = new RandomDNA(seed);
+
+ final byte[] seq1 = rdn1.nextBases(132);
+
+ final byte[] seq2 = new byte[132];
+ for (int i = 0; i < seq2.length; i++) {
+ seq2[i] = rdn2.nextBase();
+ }
+
+ final byte[] seq3 = new byte[132];
+ rdn3.nextBases(seq3);
+
+ final byte[] seq4 = new byte[132];
+ rdn4.nextBases(seq4, 0, 100);
+ rdn4.nextBases(seq4, 100, 32);
+
+ final byte[] seq5 = new byte[132];
+ rdn5.nextBases(seq5, 0, 90);
+ for (int i = 0; i < 10; i++)
+ seq5[90 + i] = rdn5.nextBase();
+ rdn5.nextBases(seq5, 100, 10);
+ for (int i = 0; i < 22; i++) {
+ seq5[110 + i] = rdn5.nextBase();
+ }
+
+ Assert.assertEquals(seq1, seq2);
+ Assert.assertEquals(seq1, seq3);
+ Assert.assertEquals(seq1, seq4);
+ Assert.assertEquals(seq1, seq5);
+ }
+
+ @Test
+ public void testSequenceEquivalenceLong() {
+ final int seed = 1311;
+ final RandomDNA rdn1 = new RandomDNA(seed);
+ final RandomDNA rdn2 = new RandomDNA(seed);
+ final RandomDNA rdn3 = new RandomDNA(seed);
+ final RandomDNA rdn4 = new RandomDNA(seed);
+ final RandomDNA rdn5 = new RandomDNA(seed);
+
+ final byte[] seq1 = rdn1.nextBases(1320);
+
+ final byte[] seq2 = new byte[1320];
+ for (int i = 0; i < seq2.length; i++) {
+ seq2[i] = rdn2.nextBase();
+ }
+
+ final byte[] seq3 = new byte[1320];
+ rdn3.nextBases(seq3);
+
+ final byte[] seq4 = new byte[1320];
+ rdn4.nextBases(seq4, 0, 1000);
+ rdn4.nextBases(seq4, 1000, 320);
+
+ final byte[] seq5 = new byte[1320];
+ rdn5.nextBases(seq5, 0, 900);
+ for (int i = 0; i < 100; i++)
+ seq5[900 + i] = rdn5.nextBase();
+ rdn5.nextBases(seq5, 1000, 100);
+ for (int i = 0; i < 220; i++) {
+ seq5[1100 + i] = rdn5.nextBase();
+ }
+
+ for (int i = 0; i < 1320; i++) {
+ Assert.assertEquals(seq1[i], seq2[i], "" + i);
+ }
+ Assert.assertEquals(seq1, seq2);
+ Assert.assertEquals(seq1, seq3);
+ Assert.assertEquals(seq1, seq4);
+ Assert.assertEquals(seq1, seq5);
+ }
+
+
private void assertFastaFileAndDictMatch(final File fastaFile, final int basesPerLine, final SAMSequenceDictionary dict) {
try (final BufferedReader reader = new BufferedReader(new FileReader(fastaFile))) {
final List> nameLengthAndFreqs = new ArrayList<>();
@@ -130,13 +228,6 @@ public void checkResults(final int[] results, final int n, final int m) {
Assert.assertTrue(mean > expectedMean-2*s/Math.sqrt(n*m), "unexpected mean:" +mean);
}
- public void assertUniformity(final Nucleotide.Counter freqs) {
- final long[] observed = new long[] { freqs.get(Nucleotide.A), freqs.get(Nucleotide.C), freqs.get(Nucleotide.G), freqs.get(Nucleotide.T)};
- final ChiSquareTest test = new ChiSquareTest();
- final double pValue = test.chiSquareTest(new double[] { 0.25, 0.25, 0.25, 0.25 }, observed );
- Assert.assertTrue( pValue > 0.001);
- }
-
private int[] pairwiseAdd(int[] a, int[] b) {
Utils.validateArg(a.length == b.length, "lengths must be equal");
return IntStream.range(0, a.length).map(n -> a[n] + b[n]).toArray();