From 70adf91e1a77c0a1436281ca42b4e32c3dab1796 Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Tue, 29 May 2018 14:04:45 -0400 Subject: [PATCH] Vrr single contig reference aligner (#4780) * Added util class to align against an on-the-fly single contig reference using Bwa-Mem. The new class returns List already translated from the BwaMenAlignment object returned by the BwaMem aligner. * Some improvements in RandomDNA --- .../utils/SingleSequenceReferenceAligner.java | 211 ++++++++++++++++++ .../SingleContigReferenceAlignerUnitTest.java | 145 ++++++++++++ .../hellbender/utils/RandomDNA.java | 150 ++++++++++--- .../hellbender/utils/RandomDNAUnitTest.java | 105 ++++++++- 4 files changed, 576 insertions(+), 35 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java new file mode 100644 index 00000000000..80804acd73b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java @@ -0,0 +1,211 @@ +package org.broadinstitute.hellbender.tools.spark.sv.utils; + +import htsjdk.samtools.SAMFlag; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig; +import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.bwa.BwaMemAligner; +import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment; +import org.broadinstitute.hellbender.utils.bwa.BwaMemIndex; +import org.broadinstitute.hellbender.utils.reference.FastaReferenceWriter; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +/** + * Encompasses an aligner to a single-sequence reference. + *

+ * This is an {@link AutoCloseable} mean to be use in try-with-resource constructs. + *

+ *

+ * If you don't do so, please remember to call {@link #close} at the end to free + * resources. + *

+ */ +public class SingleSequenceReferenceAligner implements AutoCloseable { + + private final File image; + private final BwaMemAligner aligner; + private final BwaMemIndex index; + private final List refNames; + private boolean closed = false; + + @FunctionalInterface + public interface TriFunction{ + W apply(T t, U u, V v); + } + + public static final Predicate NO_FILTER = x -> true; + + private final Function> basesOf; + private final TriFunction>, List, ? extends U> alignmentOf; + private final Predicate alignmentFilter; + + public SingleSequenceReferenceAligner(final String name, final byte[] bases, + final Function> basesOf, + final TriFunction>, List, ? extends U> alignmentOf) { + this(name, bases, basesOf, alignmentOf, NO_FILTER); + } + + public SingleSequenceReferenceAligner(final String name, final byte[] bases, + final Function> basesOf, + final TriFunction>, List, ? extends U> alignmentOf, + final Predicate alignmentFilter) { + Utils.nonNull(name, "the input reference name cannot be null"); + Utils.nonNull(bases, "the input bases cannot be null"); + this.basesOf = Utils.nonNull(basesOf); + this.alignmentOf = Utils.nonNull(alignmentOf); + this.alignmentFilter = Utils.nonNull(alignmentFilter); + Utils.validate(bases.length > 0, "the reference contig bases sequence must have at least one base"); + try { + final File fasta = File.createTempFile("ssvh-temp", ".fasta"); + fasta.deleteOnExit(); + image = new File(fasta.getParentFile(), fasta.getName().replace(".fasta", ".img")); + image.deleteOnExit(); + FastaReferenceWriter.writeSingleSequenceReference(fasta.toPath(), false, false, name, null, bases); + BwaMemIndex.createIndexImageFromFastaFile(fasta.toString(), image.toString()); + fasta.delete(); // we don't need the fasta around. + index = new BwaMemIndex(image.toString()); + aligner = new BwaMemAligner(index); + } catch (final IOException ex) { + throw new GATKException("could not create index files", ex); + } + refNames = Collections.singletonList(name); + } + + /** + * Gives access to the underlying aligner so that you can modify its options. + *

+ * You could align sequences directly thru the return object but in that case you will lose the + * {@link BwaMemAlignment} to {@link AlignmentInterval} translation. + *

+ * + * @return never {@code null}. + */ + public BwaMemAligner getAligner() { + return aligner; + } + + public List align(final Iterable inputs) { + Utils.nonNull(inputs); + return align(Utils.stream(inputs).collect(Collectors.toList())); + } + + /** + * Aligns the input object returning a list of the outputs in the corresponding order. + * @param inputs + * @return + */ + public List align(final List inputs) { + checkNotClosed(); + Utils.nonNull(inputs, "the input sequence array cannot be null"); + final List> seqs = inputs.stream().map(basesOf).collect(Collectors.toList()); + final List flattenSeqs = seqs.stream().flatMap(Collection::stream).collect(Collectors.toList()); + final List> alignments = aligner.alignSeqs(flattenSeqs); + if (alignments.size() != flattenSeqs.size()) { // paranoiah?? + throw new IllegalStateException("something went terribly wrong and the number of returned alignment list does " + + "not correspond to the number of input sequences: " + alignments.size() + " != " + flattenSeqs.size()); + } + final List result = new ArrayList<>(inputs.size()); + int nextAlignmentIndex = 0; + for (int i = 0; i < inputs.size(); i++) { + final T inputObject = inputs.get(i); + final List sequences = seqs.get(i); + final List> relevantAlignments = + alignments.subList(nextAlignmentIndex, nextAlignmentIndex += sequences.size()); + final List> filteredAlignments; + if (alignmentFilter == NO_FILTER) { + filteredAlignments = relevantAlignments; + } else { + filteredAlignments = new ArrayList<>(relevantAlignments.size()); + for (int j = 0; j < relevantAlignments.size(); j++) { + filteredAlignments.add(relevantAlignments.get(j).stream().filter(alignmentFilter).collect(Collectors.toList())); + } + } + final U outputObject = alignmentOf.apply(inputObject, filteredAlignments, refNames); + result.add(outputObject); + } + return result; + } + + /** + * Composes a map of the aligned sequences. + *

+ * The key of such a map would be determined by the input object and the output alignment. + *

+ *

+ * Iterations over the entries, keys and values of the resulting tree will follow the order + * of the input objects. In case of key collisions (more than one input object, alignment result in the same key) + * then we only keep the first occurrence of such key. + *

+ * @param inputs the input objects to align. + * @param keyOf function that composed the key given the input and output objects. + * @param + * @return never {@code null}. + */ + public Map align(final List inputs, final BiFunction keyOf) { + final List outs = align(inputs); + final LinkedHashMap result = new LinkedHashMap<>(outs.size()); + for (int i = 0; i < outs.size(); i++) { + result.putIfAbsent(keyOf.apply(inputs.get(i), outs.get(i)), outs.get(i)); + } + return result; + } + + /** + * Composes a contig aligner from an arbitrary input type given contig name and base sequence generation functions. + *

+ * Supplementary alignments will be ignored. + *

+ * + * @param refName the name of the only reference sequence. + * @param refBases the bases for the only reference sequence. + * @param nameOf function that produces the aligned contig name based on the input object. + * @param basesOf function that produces the contig bases sequence based on the input object. + * @param the type-parameter of the input object. + * @return never {@code null}. + */ + public static SingleSequenceReferenceAligner contigsAligner(final String refName, final byte[] refBases, + final Function nameOf, + final Function basesOf) { + return new SingleSequenceReferenceAligner<>(refName, refBases, + t -> Collections.singletonList(basesOf.apply(t)), + (t, bma, refNames) -> { + final String name = nameOf.apply(t); + final byte[] bases = basesOf.apply(t); + final List intervals = bma.get(0).stream().map(b -> new AlignmentInterval(b, refNames, bases.length)).collect(Collectors.toList()); + return new AlignedContig(name, bases, intervals); + }, + bma -> bma.getRefId() >= 0 && SAMFlag.SECONDARY_ALIGNMENT.isUnset(bma.getSamFlag())); + } + + private void checkNotClosed() { + if (closed) { + throw new IllegalStateException("operation not allowed once the aligner is closed"); + } + } + + @Override + public void close() throws IOException { + if (!closed) { + aligner.close(); + closed = true; + try { + index.close(); + } finally { + image.delete(); + } + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java new file mode 100644 index 00000000000..fbd20d012d0 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleContigReferenceAlignerUnitTest.java @@ -0,0 +1,145 @@ +package org.broadinstitute.hellbender.tools.spark.sv.utils; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.TextCigarCodec; +import htsjdk.samtools.util.SequenceUtil; +import org.apache.commons.math3.util.Pair; +import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig; +import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval; +import org.broadinstitute.hellbender.utils.RandomDNA; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.read.CigarUtils; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.stream.Collectors; + +/** + * Unit tests for {@link SingleSequenceReferenceAligner} + */ +public class SingleContigReferenceAlignerUnitTest extends BaseTest { + + private static final int READ_LENGTH = 250; + private static final int NUM_ALIGNS = 1000; + private static final String REF_NAME = "ref00"; + + @Test(expectedExceptions = IllegalStateException.class, expectedExceptionsMessageRegExp = "^.*closed.*$") + public void testClosedException() { + final RandomDNA rdn = new RandomDNA(131); + final byte[] refBases = rdn.nextBases(1000); + final SingleSequenceReferenceAligner aligner = SingleSequenceReferenceAligner.contigsAligner("test", refBases, + a -> "contig", a -> a); + try { + aligner.close(); + } catch (final IOException ex) { + Assert.fail("unexpected exception when closing"); + } + aligner.align(Collections.singletonList(Arrays.copyOfRange(refBases, 100, 200))); + } + + + @Test(dataProvider = "testAlignmentData") + public void testAlignment(final boolean pairAlignment, final byte[] reference, final String referenceName) + throws IOException + { + try (final SingleSequenceReferenceAligner aligner = SingleSequenceReferenceAligner.contigsAligner(referenceName, reference, + a -> "ctg", a -> a);) { + Assert.assertNotNull(aligner.getAligner()); + if (pairAlignment) { + aligner.getAligner().alignPairs(); + } + final Random rdn = new Random(13111); + final RandomDNA rdnDNA = new RandomDNA(rdn); + final List> expected = new ArrayList<>(NUM_ALIGNS << 1); + final List seqs = new ArrayList<>(NUM_ALIGNS << 1); + for (int i = 0; i < NUM_ALIGNS; i++) { + final int start = rdn.nextInt(reference.length - READ_LENGTH) + 1; + final boolean forward = rdn.nextBoolean(); + final boolean insert = rdn.nextDouble() < 0.1; + final boolean deletion = !insert && rdn.nextDouble() < 0.1; + final int indelLength = insert || deletion ? rdn.nextInt(10) + 10 : 0; + final int indelStart = insert || deletion ? rdn.nextInt((int) (READ_LENGTH * .50)) + 25 : -1; + final int end = start + READ_LENGTH - 1; + final byte[] templateSeq = Arrays.copyOfRange(reference, start - 1, end); + final byte[] actualSeq; + if (insert) { + actualSeq = Arrays.copyOf(templateSeq, templateSeq.length + indelLength); + System.arraycopy(actualSeq, indelStart - 1, actualSeq, indelStart - 1 + indelLength, templateSeq.length - indelStart + 1); + rdnDNA.nextBases(actualSeq, indelStart - 1, indelLength); + } else if (deletion) { + actualSeq = Arrays.copyOf(templateSeq, templateSeq.length - indelLength); + System.arraycopy(templateSeq, indelStart - 1 + indelLength, actualSeq, indelStart - 1, templateSeq.length - indelStart + 1 - indelLength); + } else { + actualSeq = templateSeq.clone(); + } + + while (insert && actualSeq[indelStart - 1] == actualSeq[indelStart + indelLength - 1]) { + actualSeq[indelStart + indelLength - 1] = rdnDNA.nextBase(); + } + if (!forward) { + SequenceUtil.reverseComplement(actualSeq); + } + seqs.add(actualSeq); + final Cigar cigar = (!insert && !deletion) ? TextCigarCodec.decode(READ_LENGTH + "M"): + (insert ? TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "I" + (READ_LENGTH - indelStart + 1) + "M") + : TextCigarCodec.decode( "" + (indelStart - 1) + "M" + indelLength + "D" + (READ_LENGTH - indelStart + 1 - indelLength) + "M")); + + expected.add(Collections.singletonList(new AlignmentInterval(new SimpleInterval(referenceName, start, end), 1, actualSeq.length, !forward ? CigarUtils.invertCigar(cigar) : cigar + , forward, 0, 0, 0, null))); + } + final List results = aligner.align(seqs); + final Map mapResult = aligner.align(seqs, (b, a) -> b); + Assert.assertEquals(results, new ArrayList<>(mapResult.values())); + Assert.assertEquals(new ArrayList<>(mapResult.keySet()), mapResult.values().stream().map(AlignedContig::getContigSequence).collect(Collectors.toList())); + for (int i = 0; i < NUM_ALIGNS; i++) { + final List actualValue = results.get(i).getAlignments(); + final List expectedValue = expected.get(i); + Assert.assertEquals(actualValue.size(), 1); + Assert.assertEquals(actualValue.get(0).forwardStrand, expectedValue.get(0).forwardStrand); + Assert.assertEquals(actualValue.get(0).referenceSpan, expectedValue.get(0).referenceSpan, expectedValue.get(0).cigarAlong5to3DirectionOfContig.toString()); + Assert.assertEquals(actualValue.get(0).startInAssembledContig, expectedValue.get(0).startInAssembledContig); + Assert.assertEquals(actualValue.get(0).endInAssembledContig, expectedValue.get(0).endInAssembledContig); + final Cigar expectedCigar = expectedValue.get(0).cigarAlong5to3DirectionOfContig; + final Cigar actualCigar = actualValue.get(0).cigarAlong5to3DirectionOfContig; + if (!expectedCigar.equals(actualCigar)) { // small differences may occur due to ambiguous indel location. So we check that they are small differences indeed: + Assert.assertEquals(expectedCigar.numCigarElements(), actualCigar.numCigarElements()); // same number of elements + Assert.assertEquals(expectedCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList()), + actualCigar.getCigarElements().stream().map(CigarElement::getOperator).collect(Collectors.toList())); // same operators sequence. + // then we check the total lengths per operator (must be the same): + final Map expectedLengthByOperator = expectedCigar.getCigarElements().stream() + .collect(Collectors.groupingBy(CigarElement::getOperator, + Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b))); + final Map actualLengthByOperator = actualCigar.getCigarElements().stream() + .collect(Collectors.groupingBy(CigarElement::getOperator, + Collectors.reducing(0, CigarElement::getLength, (a, b) -> a + b))); + Assert.assertEquals(actualLengthByOperator, expectedLengthByOperator); + // finally we don't allow more than 5 bases length difference for any given element. + for (int j = 0; j < expectedCigar.numCigarElements(); j++) { + Assert.assertTrue(Math.abs(expectedCigar.getCigarElement(j).getLength() - actualCigar.getCigarElement(j).getLength()) < 10, "actual: " + actualCigar + " != expected: " + expectedCigar); + } + } + } + } + } + + @DataProvider(name="testAlignmentData") + public Object[][] testAlignmentData() { + final List result = new ArrayList<>(); + final RandomDNA randomDNA = new RandomDNA(1301); + result.add(new Object[]{ false, randomDNA.nextBases(1000), REF_NAME}); + result.add(new Object[]{ true, randomDNA.nextBases(10000), REF_NAME}); + return result.toArray(new Object[result.size()][]); + } + +} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java index 3670e2c09c2..45dbfb5aebd 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNA.java @@ -1,15 +1,13 @@ package org.broadinstitute.hellbender.utils; +import com.google.api.client.repackaged.com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.utils.param.ParamUtils; import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.io.InvalidObjectException; -import java.io.PrintWriter; import java.io.Writer; import java.util.Random; @@ -20,11 +18,56 @@ * Returned bases are always in upper case and one of the valid four nucleotides 'A', 'C', 'G' and 'T'. *

* + *

+ * This class is written in a way that ensure that the sequence of bases returned would only depend on the seed and + * not the sequence of calls to different base generating methods. + *

+ *

+ * For example a single call asking for 132 bases using {@link #nextBases(int)} would generate the same sequence + * as calling 132 times {@link #nextBase()} if both random instances were initialized with the same seed. + *

+ * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ public final class RandomDNA { - private final Random random; + /** + * Number of bases that can be packed in a int (bits in an int divide by 2 bits per base). + *

+ * In practice the code below would work if this constant takes any values between 3 and the actual number + * of bit pairs in an int (16). The maximum (16) is recommended. + *

+ */ + @VisibleForTesting + static final int BASES_IN_AN_INT = Integer.SIZE >> 1; // = 16; and is extremely unlikely to ever change. + + /** + * Convenient constant + */ + private static final int BASES_IN_AN_INT_MINUS_1 = BASES_IN_AN_INT - 1; + + /** + * Maximum capacity of the next bytes queue. + * + * It must be at least {@link #BASES_IN_AN_INT} and ideally a multiple of that number + * (in order to avoid unused space in the base-buffer). + */ + @VisibleForTesting + static final int NEXT_BASES_MAX_CAPACITY = BASES_IN_AN_INT * 64; // = 1024 bases/bytes extremelly unlikely to ever change. + + protected final Random random; + + private final byte[] codeToBase = new byte[] { + Nucleotide.A.toBase(), + Nucleotide.C.toBase(), + Nucleotide.G.toBase(), + Nucleotide.T.toBase() + }; + + private final byte[] nextBases; + private int firstBaseIndex; + private int lastBaseIndex; + private int nextBasesSize; /** * Constructs a new random DNA generator. @@ -47,6 +90,10 @@ public RandomDNA() { */ public RandomDNA(final Random rnd) { random = Utils.nonNull(rnd, "the random number generator cannot be null"); + nextBases = new byte[NEXT_BASES_MAX_CAPACITY]; + firstBaseIndex = 0; + lastBaseIndex = -1; + nextBasesSize = 0; } /** @@ -66,10 +113,10 @@ public RandomDNA(final long seed) { *

* * @param destination the array to update. - * @throws NullPointerException if {@code destination} is {@code null}. + * @throws IllegalArgumentException if {@code destination} is {@code null}. */ public void nextBases(final byte[] destination) { - nextBases(destination, 0, destination.length); + nextBases(Utils.nonNull(destination, "the destination array must not be null"), 0, destination.length); } /** @@ -87,31 +134,54 @@ public void nextBases(final byte[] dest, final int offset, final int length) { ParamUtils.isPositiveOrZero(length, "the input length must be positive"); final int to = offset + length; ParamUtils.inRange(to, 0, dest.length, "the length provided goes beyond the end of the destination array"); - for (int i = offset; i < to; ) { - int randomInt = random.nextInt(); - for (int j = 0; i < to && j < Integer.SIZE; j += 2) { - final int ord = randomInt & 0x000000003; - randomInt >>= 2; - switch (ord) { - case 0: - dest[i++] = 'A'; - break; - case 1: - dest[i++] = 'C'; - break; - case 2: - dest[i++] = 'G'; - break; - case 3: - dest[i++] = 'T'; - break; - default: - throw new GATKException.ShouldNeverReachHereException("this cannot be happening!!! " + ord); - } + int remaining = length; + int nextDestIndex = offset; + while (remaining > 0) { + final int available = loadNextBases(remaining); + for (int i = 0; i < available; i++) { + dest[nextDestIndex++] = removeNextBase(); } + remaining -= available; } } + /** + * Load new bases to the next bases queue to try to satisfy a required number of next-bases. + * + *

+ * It might be that there is not enough space in the buffer to satisfy the target, in which case this will return + * something close to the capacity of the buffer, expecting the caller to clear the buffer and repeat the request + * for the remainder. + *

+ * + * @param targetSize the number of bases that the caller would like to have at its disposal. + * @return the amount of bases available given the current content of the next-bases queue, its max capacity + * and the requested target. + */ + private int loadNextBases(final int targetSize) { + final int currentNumberOfBases = nextBasesSize; + if (currentNumberOfBases >= targetSize) { + return targetSize; + } else if (NEXT_BASES_MAX_CAPACITY - currentNumberOfBases < BASES_IN_AN_INT) { + // If we try to load more bases it would go over the max capacity. + return currentNumberOfBases; + } else { + final int result = Math.min(NEXT_BASES_MAX_CAPACITY, targetSize); + final int increaseInInts = (result - currentNumberOfBases + BASES_IN_AN_INT_MINUS_1) / BASES_IN_AN_INT; + for (int i = 0; i < increaseInInts; i++) { + int randomInt = random.nextInt(); + addNextBase(codeToBase[randomInt & 0x03]); + int j = 1; + do { + randomInt >>>= 2; + addNextBase(codeToBase[randomInt & 0x03]); + } while(++j < BASES_IN_AN_INT_MINUS_1); + randomInt >>>= 2; + addNextBase(codeToBase[randomInt]); // for the last base there is no need to bother with the mask. + } + return result; + } + } /** * Returns a random DNA sequence of bases. @@ -130,7 +200,7 @@ public byte[] nextBases(final int size) { /** * Create a random reference and writes in FASTA format into a temporal file. *

- * The output file is instanciated using {@link File#createTempFile}. + * The output file is instantiated using {@link File#createTempFile}. *

*

* The invoking code is responsible to manage and dispose of the output file. @@ -194,4 +264,28 @@ public void nextFasta(final Writer out, final SAMSequenceDictionary dict, final } } } + + public byte nextBase() { + loadNextBases(1); // this never will fail to have 1 available base. + return removeNextBase(); + } + + + private void addNextBase(final byte base) { + lastBaseIndex++; + if (lastBaseIndex == NEXT_BASES_MAX_CAPACITY) { + lastBaseIndex = 0; + } + nextBases[lastBaseIndex] = base; + nextBasesSize++; + } + + private byte removeNextBase() { + final byte result = nextBases[firstBaseIndex++]; + if (firstBaseIndex == NEXT_BASES_MAX_CAPACITY) { + firstBaseIndex = 0; + } + nextBasesSize--; + return result; + } } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java index 31803ab6096..6b4add1ce9d 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/RandomDNAUnitTest.java @@ -19,6 +19,9 @@ import java.util.List; import java.util.stream.IntStream; +/** + * Unit tests for {@link RandomDNA}. + */ public final class RandomDNAUnitTest { private static final int TEST_BASES_PER_LINE = 73; @@ -35,6 +38,101 @@ public void testRandomFasta(final SAMSequenceDictionary dict) throws IOException } } + @Test + public void testBufferMaxCapacity() { + Assert.assertTrue(RandomDNA.NEXT_BASES_MAX_CAPACITY >= RandomDNA.BASES_IN_AN_INT, "invalid value for NEXT_BASES_MAX_CAPACITY"); + } + + @Test + public void testBasesInIntConstantConstraints() { + Assert.assertTrue(RandomDNA.BASES_IN_AN_INT >= 3); + Assert.assertTrue(RandomDNA.BASES_IN_AN_INT <= Integer.SIZE / 2); + } + + /** + * Checks that several ways to compose an array of bases actually result on the same sequence of bases + * given a fixed seed. + */ + @Test + public void testSequenceEquivalenceShort() { + final int seed = 1311; + final RandomDNA rdn1 = new RandomDNA(seed); + final RandomDNA rdn2 = new RandomDNA(seed); + final RandomDNA rdn3 = new RandomDNA(seed); + final RandomDNA rdn4 = new RandomDNA(seed); + final RandomDNA rdn5 = new RandomDNA(seed); + + final byte[] seq1 = rdn1.nextBases(132); + + final byte[] seq2 = new byte[132]; + for (int i = 0; i < seq2.length; i++) { + seq2[i] = rdn2.nextBase(); + } + + final byte[] seq3 = new byte[132]; + rdn3.nextBases(seq3); + + final byte[] seq4 = new byte[132]; + rdn4.nextBases(seq4, 0, 100); + rdn4.nextBases(seq4, 100, 32); + + final byte[] seq5 = new byte[132]; + rdn5.nextBases(seq5, 0, 90); + for (int i = 0; i < 10; i++) + seq5[90 + i] = rdn5.nextBase(); + rdn5.nextBases(seq5, 100, 10); + for (int i = 0; i < 22; i++) { + seq5[110 + i] = rdn5.nextBase(); + } + + Assert.assertEquals(seq1, seq2); + Assert.assertEquals(seq1, seq3); + Assert.assertEquals(seq1, seq4); + Assert.assertEquals(seq1, seq5); + } + + @Test + public void testSequenceEquivalenceLong() { + final int seed = 1311; + final RandomDNA rdn1 = new RandomDNA(seed); + final RandomDNA rdn2 = new RandomDNA(seed); + final RandomDNA rdn3 = new RandomDNA(seed); + final RandomDNA rdn4 = new RandomDNA(seed); + final RandomDNA rdn5 = new RandomDNA(seed); + + final byte[] seq1 = rdn1.nextBases(1320); + + final byte[] seq2 = new byte[1320]; + for (int i = 0; i < seq2.length; i++) { + seq2[i] = rdn2.nextBase(); + } + + final byte[] seq3 = new byte[1320]; + rdn3.nextBases(seq3); + + final byte[] seq4 = new byte[1320]; + rdn4.nextBases(seq4, 0, 1000); + rdn4.nextBases(seq4, 1000, 320); + + final byte[] seq5 = new byte[1320]; + rdn5.nextBases(seq5, 0, 900); + for (int i = 0; i < 100; i++) + seq5[900 + i] = rdn5.nextBase(); + rdn5.nextBases(seq5, 1000, 100); + for (int i = 0; i < 220; i++) { + seq5[1100 + i] = rdn5.nextBase(); + } + + for (int i = 0; i < 1320; i++) { + Assert.assertEquals(seq1[i], seq2[i], "" + i); + } + Assert.assertEquals(seq1, seq2); + Assert.assertEquals(seq1, seq3); + Assert.assertEquals(seq1, seq4); + Assert.assertEquals(seq1, seq5); + } + + private void assertFastaFileAndDictMatch(final File fastaFile, final int basesPerLine, final SAMSequenceDictionary dict) { try (final BufferedReader reader = new BufferedReader(new FileReader(fastaFile))) { final List> nameLengthAndFreqs = new ArrayList<>(); @@ -130,13 +228,6 @@ public void checkResults(final int[] results, final int n, final int m) { Assert.assertTrue(mean > expectedMean-2*s/Math.sqrt(n*m), "unexpected mean:" +mean); } - public void assertUniformity(final Nucleotide.Counter freqs) { - final long[] observed = new long[] { freqs.get(Nucleotide.A), freqs.get(Nucleotide.C), freqs.get(Nucleotide.G), freqs.get(Nucleotide.T)}; - final ChiSquareTest test = new ChiSquareTest(); - final double pValue = test.chiSquareTest(new double[] { 0.25, 0.25, 0.25, 0.25 }, observed ); - Assert.assertTrue( pValue > 0.001); - } - private int[] pairwiseAdd(int[] a, int[] b) { Utils.validateArg(a.length == b.length, "lengths must be equal"); return IntStream.range(0, a.length).map(n -> a[n] + b[n]).toArray();