From 70adf91e1a77c0a1436281ca42b4e32c3dab1796 Mon Sep 17 00:00:00 2001
From: Valentin Ruano Rubio <valentin@broadinstitute.org>
Date: Tue, 29 May 2018 14:04:45 -0400
Subject: [PATCH] Vrr single contig reference aligner (#4780)

* Added util class to align against an on-the-fly single contig reference using Bwa-Mem.

The new class returns List<AlignmentInterval> already translated from the BwaMenAlignment object returned by the BwaMem aligner.

* Some improvements in RandomDNA
---
 .../utils/SingleSequenceReferenceAligner.java | 211 ++++++++++++++++++
 .../SingleContigReferenceAlignerUnitTest.java | 145 ++++++++++++
 .../hellbender/utils/RandomDNA.java           | 150 ++++++++++---
 .../hellbender/utils/RandomDNAUnitTest.java   | 105 ++++++++-
 4 files changed, 576 insertions(+), 35 deletions(-)
 create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
 create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
new file mode 100644
index 00000000000..80804acd73b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java
@@ -0,0 +1,211 @@
+package org.broadinstitute.hellbender.tools.spark.sv.utils;
+
+import htsjdk.samtools.SAMFlag;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemAligner;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
+import org.broadinstitute.hellbender.utils.bwa.BwaMemIndex;
+import org.broadinstitute.hellbender.utils.reference.FastaReferenceWriter;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+/**
+ * Encompasses an aligner to a single-sequence reference.
+ * <p>
+ *     This is an {@link AutoCloseable} mean to be use in try-with-resource constructs.
+ * </p>
+ * <p>
+ *     If you don't do so, please remember to call {@link #close} at the end to free
+ *     resources.
+ * </p>
+ */
+public class SingleSequenceReferenceAligner<T, U> implements AutoCloseable {
+
+    private final File image;
+    private final BwaMemAligner aligner;
+    private final BwaMemIndex index;
+    private final List<String> refNames;
+    private boolean closed = false;
+
+    @FunctionalInterface
+    public interface TriFunction<T, U, V, W>{
+        W apply(T t, U u, V v);
+    }
+
+    public static final Predicate<? super BwaMemAlignment> NO_FILTER = x -> true;
+
+    private final Function<? super T, List<byte[]>> basesOf;
+    private final TriFunction<? super T, List<List<BwaMemAlignment>>, List<String>, ? extends U> alignmentOf;
+    private final Predicate<? super BwaMemAlignment> alignmentFilter;
+
+    public SingleSequenceReferenceAligner(final String name, final byte[] bases,
+                                          final Function<? super T, List<byte[]>> basesOf,
+                                          final TriFunction<? super T, List<List<BwaMemAlignment>>, List<String>, ? extends U> alignmentOf) {
+        this(name, bases, basesOf, alignmentOf, NO_FILTER);
+    }
+
+    public SingleSequenceReferenceAligner(final String name, final byte[] bases,
+                                          final Function<? super T, List<byte[]>> basesOf,
+                                          final TriFunction<? super T, List<List<BwaMemAlignment>>, List<String>, ? extends U> alignmentOf,
+                                          final Predicate<? super BwaMemAlignment> alignmentFilter) {
+        Utils.nonNull(name, "the input reference name cannot be null");
+        Utils.nonNull(bases, "the input bases cannot be null");
+        this.basesOf = Utils.nonNull(basesOf);
+        this.alignmentOf = Utils.nonNull(alignmentOf);
+        this.alignmentFilter = Utils.nonNull(alignmentFilter);
+        Utils.validate(bases.length > 0, "the reference contig bases sequence must have at least one base");
+        try {
+            final File fasta = File.createTempFile("ssvh-temp", ".fasta");
+            fasta.deleteOnExit();
+            image = new File(fasta.getParentFile(), fasta.getName().replace(".fasta", ".img"));
+            image.deleteOnExit();
+            FastaReferenceWriter.writeSingleSequenceReference(fasta.toPath(), false, false, name, null, bases);
+            BwaMemIndex.createIndexImageFromFastaFile(fasta.toString(), image.toString());
+            fasta.delete(); // we don't need the fasta around.
+            index = new BwaMemIndex(image.toString());
+            aligner = new BwaMemAligner(index);
+        } catch (final IOException ex) {
+            throw new GATKException("could not create index files", ex);
+        }
+        refNames = Collections.singletonList(name);
+    }
+
+    /**
+     * Gives access to the underlying aligner so that you can modify its options.
+     * <p>
+     *     You could align sequences directly thru the return object but in that case you will lose the
+     *     {@link BwaMemAlignment} to {@link AlignmentInterval} translation.
+     * </p>
+     *
+     * @return never {@code null}.
+     */
+    public BwaMemAligner getAligner() {
+        return aligner;
+    }
+
+    public List<U> align(final Iterable<? extends T> inputs) {
+        Utils.nonNull(inputs);
+        return align(Utils.stream(inputs).collect(Collectors.toList()));
+    }
+
+    /**
+     * Aligns the input object returning a list of the outputs in the corresponding order.
+     * @param inputs
+     * @return
+     */
+    public List<U> align(final List<? extends T> inputs) {
+        checkNotClosed();
+        Utils.nonNull(inputs, "the input sequence array cannot be null");
+        final List<List<byte[]>> seqs = inputs.stream().map(basesOf).collect(Collectors.toList());
+        final List<byte[]> flattenSeqs = seqs.stream().flatMap(Collection::stream).collect(Collectors.toList());
+        final List<List<BwaMemAlignment>> alignments = aligner.alignSeqs(flattenSeqs);
+        if (alignments.size() != flattenSeqs.size()) { // paranoiah??
+            throw new IllegalStateException("something went terribly wrong and the number of returned alignment list does " +
+                    "not correspond to the number of input sequences: " + alignments.size() + " != " + flattenSeqs.size());
+        }
+        final List<U> result = new ArrayList<>(inputs.size());
+        int nextAlignmentIndex = 0;
+        for (int i = 0; i < inputs.size(); i++) {
+            final T inputObject = inputs.get(i);
+            final List<byte[]> sequences = seqs.get(i);
+            final List<List<BwaMemAlignment>> relevantAlignments =
+                    alignments.subList(nextAlignmentIndex, nextAlignmentIndex += sequences.size());
+            final List<List<BwaMemAlignment>> filteredAlignments;
+            if (alignmentFilter == NO_FILTER) {
+                filteredAlignments = relevantAlignments;
+            } else {
+                filteredAlignments = new ArrayList<>(relevantAlignments.size());
+                for (int j = 0; j < relevantAlignments.size(); j++) {
+                    filteredAlignments.add(relevantAlignments.get(j).stream().filter(alignmentFilter).collect(Collectors.toList()));
+                }
+            }
+            final U outputObject = alignmentOf.apply(inputObject, filteredAlignments, refNames);
+            result.add(outputObject);
+        }
+        return result;
+    }
+
+    /**
+     * Composes a map of the aligned sequences.
+     * <p>
+     *     The key of such a map would be determined by the input object and the output alignment.
+     * </p>
+     * <p>
+     *     Iterations over the entries, keys and values of the resulting tree will follow the order
+     *     of the input objects. In case of key collisions (more than one input object, alignment result in the same key)
+     *     then we only keep the first occurrence of such key.
+     * </p>
+     * @param inputs the input objects to align.
+     * @param keyOf function that composed the key given the input and output objects.
+     * @param <V>
+     * @return never {@code null}.
+     */
+    public <V> Map<V, U> align(final List<? extends T> inputs, final BiFunction<? super T, ? super U, ? extends V> keyOf) {
+        final List<U> outs = align(inputs);
+        final LinkedHashMap<V, U> result = new LinkedHashMap<>(outs.size());
+        for (int i = 0; i < outs.size(); i++) {
+            result.putIfAbsent(keyOf.apply(inputs.get(i), outs.get(i)), outs.get(i));
+        }
+        return result;
+    }
+
+    /**
+     * Composes a contig aligner from an arbitrary input type given contig name and base sequence generation functions.
+     * <p>
+     *     Supplementary alignments will be ignored.
+     * </p>
+     *
+     * @param refName the name of the only reference sequence.
+     * @param refBases the bases for the only reference sequence.
+     * @param nameOf function that produces the aligned contig name based on the input object.
+     * @param basesOf function that produces the contig bases sequence based on the input object.
+     * @param <T> the type-parameter of the input object.
+     * @return never {@code null}.
+     */
+    public static <T> SingleSequenceReferenceAligner<T, AlignedContig> contigsAligner(final String refName, final byte[] refBases,
+                                                                                      final Function<? super T, String> nameOf,
+                                                                                      final Function<? super T, byte[]> basesOf) {
+        return new SingleSequenceReferenceAligner<>(refName, refBases,
+                t -> Collections.singletonList(basesOf.apply(t)),
+                (t, bma, refNames) -> {
+                        final String name = nameOf.apply(t);
+                        final byte[] bases = basesOf.apply(t);
+                        final List<AlignmentInterval> intervals = bma.get(0).stream().map(b -> new AlignmentInterval(b, refNames, bases.length)).collect(Collectors.toList());
+                        return new AlignedContig(name, bases, intervals);
+                },
+                bma -> bma.getRefId() >= 0 && SAMFlag.SECONDARY_ALIGNMENT.isUnset(bma.getSamFlag()));
+    }
+
+    private void checkNotClosed() {
+        if (closed) {
+            throw new IllegalStateException("operation not allowed once the aligner is closed");
+        }
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (!closed) {
+            aligner.close();
+            closed = true;
+            try {
+                index.close();
+            } finally {
+                image.delete();
+            }
+        }
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
new file mode 100644
index 00000000000..fbd20d012d0
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java
@@ -0,0 +1,145 @@
+package org.broadinstitute.hellbender.tools.spark.sv.utils;
+
+import htsjdk.samtools.Cigar;
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.TextCigarCodec;
+import htsjdk.samtools.util.SequenceUtil;
+import org.apache.commons.math3.util.Pair;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
+import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
+import org.broadinstitute.hellbender.utils.RandomDNA;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.read.CigarUtils;
+import org.broadinstitute.hellbender.utils.test.BaseTest;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.stream.Collectors;
+
+/**
+ * Unit tests for {@link SingleSequenceReferenceAligner}
+ */
+public class SingleContigReferenceAlignerUnitTest extends BaseTest {
+
+    private static final int READ_LENGTH = 250;
+    private static final int NUM_ALIGNS = 1000;
+    private static final String REF_NAME = "ref00";
+
+    @Test(expectedExceptions = IllegalStateException.class, expectedExceptionsMessageRegExp = "^.*closed.*$")
+    public void testClosedException() {
+        final RandomDNA rdn = new RandomDNA(131);
+        final byte[] refBases = rdn.nextBases(1000);
+        final SingleSequenceReferenceAligner<byte[], AlignedContig> aligner = SingleSequenceReferenceAligner.contigsAligner("test", refBases,
+                a -> "contig", a -> a);
+        try {
+            aligner.close();
+        } catch (final IOException ex) {
+            Assert.fail("unexpected exception when closing");
+        }
+        aligner.align(Collections.singletonList(Arrays.copyOfRange(refBases, 100, 200)));
+    }
+
+
+    @Test(dataProvider = "testAlignmentData")
+    public void testAlignment(final boolean pairAlignment, final byte[] reference, final String referenceName)
+        throws IOException
+    {
+        try (final SingleSequenceReferenceAligner<byte[], AlignedContig> aligner =  SingleSequenceReferenceAligner.contigsAligner(referenceName, reference,
+                a -> "ctg", a -> a);) {
+            Assert.assertNotNull(aligner.getAligner());
+            if (pairAlignment) {
+                aligner.getAligner().alignPairs();
+            }
+            final Random rdn = new Random(13111);
+            final RandomDNA rdnDNA = new RandomDNA(rdn);
+            final List<List<AlignmentInterval>> expected = new ArrayList<>(NUM_ALIGNS << 1);
+            final List<byte[]> seqs = new ArrayList<>(NUM_ALIGNS << 1);
+            for (int i = 0; i < NUM_ALIGNS; i++) {
+                final int start = rdn.nextInt(reference.length - READ_LENGTH) + 1;
+                final boolean forward = rdn.nextBoolean();
+                final boolean insert = rdn.nextDouble() < 0.1;
+                final boolean deletion = !insert && rdn.nextDouble() < 0.1;
+                final int indelLength = insert || deletion ? rdn.nextInt(10) + 10 : 0;
+                final int indelStart = insert || deletion ? rdn.nextInt((int) (READ_LENGTH  * .50)) + 25 : -1;
+                final int end = start + READ_LENGTH - 1;
+                final byte[] templateSeq = Arrays.copyOfRange(reference, start - 1, end);
+                final byte[] actualSeq;
+                if (insert) {
+                    actualSeq = Arrays.copyOf(templateSeq, templateSeq.length + indelLength);
+                    System.arraycopy(actualSeq, indelStart - 1, actualSeq, indelStart - 1 + indelLength, templateSeq.length - indelStart + 1);
+                    rdnDNA.nextBases(actualSeq, indelStart - 1, indelLength);
+                } else if (deletion) {
+                    actualSeq = Arrays.copyOf(templateSeq, templateSeq.length - indelLength);
+                    System.arraycopy(templateSeq, indelStart - 1 + indelLength, actualSeq, indelStart - 1, templateSeq.length - indelStart + 1 - indelLength);
+                } else {
+                    actualSeq = templateSeq.clone();
+                }
+
+                while (insert && actualSeq[indelStart - 1] == actualSeq[indelStart + indelLength - 1]) {
+                    actualSeq[indelStart + indelLength - 1] = rdnDNA.nextBase();
+                }
+                if (!forward) {
+                    SequenceUtil.reverseComplement(actualSeq);
+                }
+                seqs.add(actualSeq);
+                final Cigar cigar = (!insert && !deletion) ? TextCigarCodec.decode(READ_LENGTH + "M"):
+                        (insert ? TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "I" + (READ_LENGTH - indelStart + 1) + "M")
+                                : TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "D" + (READ_LENGTH - indelStart + 1 - indelLength) + "M"));
+
+                expected.add(Collections.singletonList(new AlignmentInterval(new SimpleInterval(referenceName, start, end), 1, actualSeq.length, !forward ? CigarUtils.invertCigar(cigar) : cigar
+                        , forward, 0, 0, 0, null)));
+            }
+            final List<AlignedContig> results = aligner.align(seqs);
+            final Map<byte[], AlignedContig> mapResult = aligner.align(seqs, (b, a) -> b);
+            Assert.assertEquals(results, new ArrayList<>(mapResult.values()));
+            Assert.assertEquals(new ArrayList<>(mapResult.keySet()), mapResult.values().stream().map(AlignedContig::getContigSequence).collect(Collectors.toList()));
+            for (int i = 0; i < NUM_ALIGNS; i++) {
+                final List<AlignmentInterval> actualValue = results.get(i).getAlignments();
+                final List<AlignmentInterval> expectedValue = expected.get(i);
+                Assert.assertEquals(actualValue.size(), 1);
+                Assert.assertEquals(actualValue.get(0).forwardStrand, expectedValue.get(0).forwardStrand);
+                Assert.assertEquals(actualValue.get(0).referenceSpan, expectedValue.get(0).referenceSpan, expectedValue.get(0).cigarAlong5to3DirectionOfContig.toString());
+                Assert.assertEquals(actualValue.get(0).startInAssembledContig, expectedValue.get(0).startInAssembledContig);
+                Assert.assertEquals(actualValue.get(0).endInAssembledContig, expectedValue.get(0).endInAssembledContig);
+                final Cigar expectedCigar = expectedValue.get(0).cigarAlong5to3DirectionOfContig;
+                final Cigar actualCigar = actualValue.get(0).cigarAlong5to3DirectionOfContig;
+                if (!expectedCigar.equals(actualCigar)) { // small differences may occur due to ambiguous indel location. So we check that they are small differences indeed:
+                    Assert.assertEquals(expectedCigar.numCigarElements(), actualCigar.numCigarElements()); // same number of elements
+                    Assert.assertEquals(expectedCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList()),
+                            actualCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList())); // same operators sequence.
+                    // then we check the total lengths per operator (must be the same):
+                    final Map<CigarOperator, Integer> expectedLengthByOperator = expectedCigar.getCigarElements().stream()
+                            .collect(Collectors.groupingBy(CigarElement::getOperator,
+                                    Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
+                    final Map<CigarOperator, Integer> actualLengthByOperator = actualCigar.getCigarElements().stream()
+                            .collect(Collectors.groupingBy(CigarElement::getOperator,
+                                    Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b)));
+                    Assert.assertEquals(actualLengthByOperator, expectedLengthByOperator);
+                    // finally we don't allow more than 5 bases length difference for any given element.
+                    for (int j = 0; j < expectedCigar.numCigarElements(); j++) {
+                        Assert.assertTrue(Math.abs(expectedCigar.getCigarElement(j).getLength() - actualCigar.getCigarElement(j).getLength()) < 10, "actual: " + actualCigar + " != expected: " + expectedCigar);
+                    }
+                }
+            }
+        }
+    }
+
+    @DataProvider(name="testAlignmentData")
+    public Object[][] testAlignmentData() {
+        final List<Object[]> result = new ArrayList<>();
+        final RandomDNA randomDNA = new RandomDNA(1301);
+        result.add(new Object[]{ false, randomDNA.nextBases(1000), REF_NAME});
+        result.add(new Object[]{ true, randomDNA.nextBases(10000), REF_NAME});
+        return result.toArray(new Object[result.size()][]);
+    }
+
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java
index 3670e2c09c2..45dbfb5aebd 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java
@@ -1,15 +1,13 @@
 package org.broadinstitute.hellbender.utils;
 
+import com.google.api.client.repackaged.com.google.common.annotations.VisibleForTesting;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.SAMSequenceRecord;
-import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.utils.param.ParamUtils;
 
 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.io.InvalidObjectException;
-import java.io.PrintWriter;
 import java.io.Writer;
 import java.util.Random;
 
@@ -20,11 +18,56 @@
  *     Returned bases are always in upper case and one of the valid four nucleotides 'A', 'C', 'G' and 'T'.
  * </p>
  *
+ * <p>
+ *     This class is written in a way that ensure that the sequence of bases returned would only depend on the seed and
+ *     not the sequence of calls to different base generating methods.
+ * </p>
+ * <p>
+ *     For example a single call asking for 132 bases using {@link #nextBases(int)} would generate the same sequence
+ *     as calling 132 times {@link #nextBase()} if both random instances were initialized with the same seed.
+ * </p>
+ *
  * @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
  */
 public final class RandomDNA {
 
-    private final Random random;
+    /**
+     * Number of bases that can be packed in a int (bits in an int divide by 2 bits per base).
+     * <p>
+     * In practice the code below would work if this constant takes any values between 3 and the actual number
+     * of bit pairs in an int (16). The maximum (16) is recommended.
+     * </p>
+     */
+    @VisibleForTesting
+    static final int BASES_IN_AN_INT = Integer.SIZE >> 1; // = 16; and is extremely unlikely to ever change.
+
+    /**
+     * Convenient constant
+     */
+    private static final int BASES_IN_AN_INT_MINUS_1 = BASES_IN_AN_INT - 1;
+
+    /**
+     * Maximum capacity of the next bytes queue.
+     *
+     * It must be at least {@link #BASES_IN_AN_INT} and ideally a multiple of that number
+     * (in order to avoid unused space in the base-buffer).
+     */
+    @VisibleForTesting
+    static final int NEXT_BASES_MAX_CAPACITY = BASES_IN_AN_INT * 64; // = 1024 bases/bytes  extremelly unlikely to ever change.
+
+    protected final Random random;
+
+    private final byte[] codeToBase = new byte[] {
+            Nucleotide.A.toBase(),
+            Nucleotide.C.toBase(),
+            Nucleotide.G.toBase(),
+            Nucleotide.T.toBase()
+    };
+
+    private final byte[] nextBases;
+    private int firstBaseIndex;
+    private int lastBaseIndex;
+    private int nextBasesSize;
 
     /**
      * Constructs a new random DNA generator.
@@ -47,6 +90,10 @@ public RandomDNA() {
      */
     public RandomDNA(final Random rnd) {
         random = Utils.nonNull(rnd, "the random number generator cannot be null");
+        nextBases = new byte[NEXT_BASES_MAX_CAPACITY];
+        firstBaseIndex = 0;
+        lastBaseIndex = -1;
+        nextBasesSize = 0;
     }
 
     /**
@@ -66,10 +113,10 @@ public RandomDNA(final long seed) {
      * </p>
      *
      * @param destination the array to update.
-     * @throws NullPointerException if {@code destination} is {@code null}.
+     * @throws IllegalArgumentException if {@code destination} is {@code null}.
      */
     public void nextBases(final byte[] destination) {
-        nextBases(destination, 0, destination.length);
+        nextBases(Utils.nonNull(destination, "the destination array must not be null"), 0, destination.length);
     }
 
     /**
@@ -87,31 +134,54 @@ public void nextBases(final byte[] dest, final int offset, final int length) {
         ParamUtils.isPositiveOrZero(length, "the input length must be positive");
         final int to = offset + length;
         ParamUtils.inRange(to, 0, dest.length, "the length provided goes beyond the end of the destination array");
-        for (int i = offset; i < to; ) {
-            int randomInt = random.nextInt();
-            for (int j = 0; i < to && j < Integer.SIZE; j += 2) {
-                final int ord = randomInt & 0x000000003;
-                randomInt >>= 2;
-                switch (ord) {
-                    case 0:
-                        dest[i++] = 'A';
-                        break;
-                    case 1:
-                        dest[i++] = 'C';
-                        break;
-                    case 2:
-                        dest[i++] = 'G';
-                        break;
-                    case 3:
-                        dest[i++] = 'T';
-                        break;
-                    default:
-                        throw new GATKException.ShouldNeverReachHereException("this cannot be happening!!! " + ord);
-                }
+        int remaining = length;
+        int nextDestIndex = offset;
+        while (remaining > 0) {
+            final int available = loadNextBases(remaining);
+            for (int i = 0; i < available; i++) {
+                dest[nextDestIndex++] = removeNextBase();
             }
+            remaining -= available;
         }
     }
 
+    /**
+     * Load new bases to the next bases queue to try to satisfy a required number of next-bases.
+     *
+     * <p>
+     *     It might be that there is not enough space in the buffer to satisfy the target, in which case this will return
+     * something close to the capacity of the buffer, expecting the caller to clear the buffer and repeat the request
+     * for the remainder.
+     * </p>
+     *
+     * @param targetSize the number of bases that the caller would like to have at its disposal.
+     * @return the amount of bases available given the current content of the next-bases queue, its max capacity
+     * and the requested target.
+     */
+    private int loadNextBases(final int targetSize) {
+        final int currentNumberOfBases = nextBasesSize;
+        if (currentNumberOfBases >= targetSize) {
+            return targetSize;
+        } else if (NEXT_BASES_MAX_CAPACITY - currentNumberOfBases < BASES_IN_AN_INT) {
+            // If we try to load more bases it would go over the max capacity.
+            return currentNumberOfBases;
+        } else  {
+            final int result = Math.min(NEXT_BASES_MAX_CAPACITY, targetSize);
+            final int increaseInInts = (result - currentNumberOfBases + BASES_IN_AN_INT_MINUS_1) / BASES_IN_AN_INT;
+            for (int i = 0; i < increaseInInts; i++) {
+                int randomInt = random.nextInt();
+                addNextBase(codeToBase[randomInt & 0x03]);
+                int j = 1;
+                do {
+                    randomInt >>>= 2;
+                    addNextBase(codeToBase[randomInt & 0x03]);
+                } while(++j < BASES_IN_AN_INT_MINUS_1);
+                randomInt >>>= 2;
+                addNextBase(codeToBase[randomInt]); // for the last base there is no need to bother with the mask.
+            }
+            return result;
+        }
+    }
 
     /**
      * Returns a random DNA sequence of bases.
@@ -130,7 +200,7 @@ public byte[] nextBases(final int size) {
     /**
      * Create a random reference and writes in FASTA format into a temporal file.
      * <p>
-     *     The output file is instanciated using {@link File#createTempFile}.
+     *     The output file is instantiated using {@link File#createTempFile}.
      * </p>
      * <p>
      *     The invoking code is responsible to manage and dispose of the output file.
@@ -194,4 +264,28 @@ public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final
             }
         }
     }
+
+    public byte nextBase() {
+        loadNextBases(1); // this never will fail to have 1 available base.
+        return removeNextBase();
+    }
+
+
+    private void addNextBase(final byte base) {
+        lastBaseIndex++;
+        if (lastBaseIndex == NEXT_BASES_MAX_CAPACITY) {
+            lastBaseIndex = 0;
+        }
+        nextBases[lastBaseIndex] = base;
+        nextBasesSize++;
+    }
+
+    private byte removeNextBase() {
+        final byte result = nextBases[firstBaseIndex++];
+        if (firstBaseIndex == NEXT_BASES_MAX_CAPACITY) {
+            firstBaseIndex = 0;
+        }
+        nextBasesSize--;
+        return result;
+    }
 }
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
index 31803ab6096..6b4add1ce9d 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java
@@ -19,6 +19,9 @@
 import java.util.List;
 import java.util.stream.IntStream;
 
+/**
+ * Unit tests for {@link RandomDNA}.
+ */
 public final class RandomDNAUnitTest {
 
     private static final int TEST_BASES_PER_LINE = 73;
@@ -35,6 +38,101 @@ public void testRandomFasta(final SAMSequenceDictionary dict) throws IOException
         }
     }
 
+    @Test
+    public void testBufferMaxCapacity() {
+        Assert.assertTrue(RandomDNA.NEXT_BASES_MAX_CAPACITY >= RandomDNA.BASES_IN_AN_INT, "invalid value for NEXT_BASES_MAX_CAPACITY");
+    }
+
+    @Test
+    public void testBasesInIntConstantConstraints() {
+        Assert.assertTrue(RandomDNA.BASES_IN_AN_INT >= 3);
+        Assert.assertTrue(RandomDNA.BASES_IN_AN_INT <= Integer.SIZE / 2);
+    }
+
+    /**
+     * Checks that several ways to compose an array of bases actually result on the same sequence of bases
+     * given a fixed seed.
+     */
+    @Test
+    public void testSequenceEquivalenceShort() {
+        final int seed = 1311;
+        final RandomDNA rdn1 = new RandomDNA(seed);
+        final RandomDNA rdn2 = new RandomDNA(seed);
+        final RandomDNA rdn3 = new RandomDNA(seed);
+        final RandomDNA rdn4 = new RandomDNA(seed);
+        final RandomDNA rdn5 = new RandomDNA(seed);
+
+        final byte[] seq1 = rdn1.nextBases(132);
+
+        final byte[] seq2 = new byte[132];
+        for (int i = 0; i < seq2.length; i++) {
+            seq2[i] = rdn2.nextBase();
+        }
+
+        final byte[] seq3 = new byte[132];
+        rdn3.nextBases(seq3);
+
+        final byte[] seq4 = new byte[132];
+        rdn4.nextBases(seq4, 0, 100);
+        rdn4.nextBases(seq4, 100, 32);
+
+        final byte[] seq5 = new byte[132];
+        rdn5.nextBases(seq5, 0, 90);
+        for (int i = 0; i < 10; i++)
+            seq5[90 + i] = rdn5.nextBase();
+        rdn5.nextBases(seq5, 100, 10);
+        for (int i = 0; i < 22; i++) {
+            seq5[110 + i] = rdn5.nextBase();
+        }
+
+        Assert.assertEquals(seq1, seq2);
+        Assert.assertEquals(seq1, seq3);
+        Assert.assertEquals(seq1, seq4);
+        Assert.assertEquals(seq1, seq5);
+    }
+
+    @Test
+    public void testSequenceEquivalenceLong() {
+        final int seed = 1311;
+        final RandomDNA rdn1 = new RandomDNA(seed);
+        final RandomDNA rdn2 = new RandomDNA(seed);
+        final RandomDNA rdn3 = new RandomDNA(seed);
+        final RandomDNA rdn4 = new RandomDNA(seed);
+        final RandomDNA rdn5 = new RandomDNA(seed);
+
+        final byte[] seq1 = rdn1.nextBases(1320);
+
+        final byte[] seq2 = new byte[1320];
+        for (int i = 0; i < seq2.length; i++) {
+            seq2[i] = rdn2.nextBase();
+        }
+
+        final byte[] seq3 = new byte[1320];
+        rdn3.nextBases(seq3);
+
+        final byte[] seq4 = new byte[1320];
+        rdn4.nextBases(seq4, 0, 1000);
+        rdn4.nextBases(seq4, 1000, 320);
+
+        final byte[] seq5 = new byte[1320];
+        rdn5.nextBases(seq5, 0, 900);
+        for (int i = 0; i < 100; i++)
+            seq5[900 + i] = rdn5.nextBase();
+        rdn5.nextBases(seq5, 1000, 100);
+        for (int i = 0; i < 220; i++) {
+            seq5[1100 + i] = rdn5.nextBase();
+        }
+
+        for (int i = 0; i < 1320; i++) {
+            Assert.assertEquals(seq1[i], seq2[i], "" + i);
+        }
+        Assert.assertEquals(seq1, seq2);
+        Assert.assertEquals(seq1, seq3);
+        Assert.assertEquals(seq1, seq4);
+        Assert.assertEquals(seq1, seq5);
+    }
+
+
     private void assertFastaFileAndDictMatch(final File fastaFile, final int basesPerLine, final SAMSequenceDictionary dict) {
         try (final BufferedReader reader = new BufferedReader(new FileReader(fastaFile))) {
             final List<Pair<String, Nucleotide.Counter>> nameLengthAndFreqs = new ArrayList<>();
@@ -130,13 +228,6 @@ public void checkResults(final int[] results, final int n, final int m) {
         Assert.assertTrue(mean > expectedMean-2*s/Math.sqrt(n*m), "unexpected mean:" +mean);
     }
 
-    public void assertUniformity(final Nucleotide.Counter freqs) {
-        final long[] observed = new long[] { freqs.get(Nucleotide.A), freqs.get(Nucleotide.C), freqs.get(Nucleotide.G), freqs.get(Nucleotide.T)};
-        final ChiSquareTest test = new ChiSquareTest();
-        final double pValue = test.chiSquareTest(new double[] { 0.25, 0.25, 0.25, 0.25 }, observed );
-        Assert.assertTrue( pValue > 0.001);
-    }
-
     private int[] pairwiseAdd(int[] a, int[] b) {
         Utils.validateArg(a.length == b.length, "lengths must be equal");
         return IntStream.range(0, a.length).map(n -> a[n] + b[n]).toArray();