v1.3.6

hartleys · Sep 25, 2018 · 099881f · 099881f
1 parent 39cd1fc
commit 099881f
Show file tree

Hide file tree

Showing 20 changed files with 1,858 additions and 97 deletions.
diff --git a/QoRTs-vignette.pdf b/QoRTs-vignette.pdf
diff --git a/QoRTs.jar b/QoRTs.jar
diff --git a/QoRTs_1.3.6.tar.gz b/QoRTs_1.3.6.tar.gz
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
-# QoRTs v1.3.0
-(Compiled Fri Oct 20 11:56:37 EDT 2017)
+# QoRTs v1.3.6
+(Compiled Tue Sep 25 11:21:46 EDT 2018)
 
 The [QoRTs software package](http://hartleys.github.io/QoRTs/) is a fast, efficient, and portable 
 multifunction toolkit designed to assist in

diff --git a/example-walkthrough.pdf b/example-walkthrough.pdf
diff --git a/src/HartleyUtils/assembleHSU.sh b/src/HartleyUtils/assembleHSU.sh
@@ -20,3 +20,5 @@ export _JAVA_OPTIONS="-Xmx1000M -Xms500M -XX:ParallelGCThreads=1"
 export MALLOC_ARENA_MAX=1
 
 sbt < sbtAssemblyCommand.txt
+
+cp target/scala-2.12/QoRTs.jar ./
diff --git a/src/HartleyUtils/build.sbt b/src/HartleyUtils/build.sbt
@@ -1,12 +1,10 @@
-import AssemblyKeys._
 
-assemblySettings
 
 scalaVersion := "2.12.4"
 
-scalacOptions := Seq("-unchecked", "-deprecation")
+scalacOptions := Seq("-unchecked", "-deprecation", "-feature")
 
-jarName in assembly := "QoRTs.jar"
+assemblyJarName in assembly := "QoRTs.jar"
 
 mainClass in assembly := Some("runner.runner")
 
diff --git a/src/HartleyUtils/project/assembly.sbt b/src/HartleyUtils/project/assembly.sbt
@@ -0,0 +1,3 @@
+
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
+
diff --git a/src/HartleyUtils/project/build.properties b/src/HartleyUtils/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.1.4
diff --git a/src/HartleyUtils/project/plugins.sbt b/src/HartleyUtils/project/plugins.sbt
@@ -1,3 +1,3 @@
 resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
 
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")
+//addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")
diff --git a/src/HartleyUtils/sbtAssemblyCommand.txt b/src/HartleyUtils/sbtAssemblyCommand.txt
@@ -1 +1,4 @@
 assembly
+
+exit
+
diff --git a/src/HartleyUtils/src/main/scala/fileConversionUtils/filterBamFile.scala b/src/HartleyUtils/src/main/scala/fileConversionUtils/filterBamFile.scala
diff --git a/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala b/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala
@@ -40,6 +40,7 @@ object Reporter {
       }
     }
 
+
     /*
      * Do not use the below functions:
      */
@@ -238,6 +239,31 @@ object Reporter {
     }
   }
 
+  var PROGRESS_NEEDS_NEWLINE = true;
+  def progressReport(i : Int, s : String, verb : String = "progress"){
+    report("] " + (s +: getProgressReportStrings(i).map{s => "      "+s} ).mkString("\n")+"\n",verb=verb);
+  }
+  def progressDot(i : Int, dotsPerGroup : Int = 5, groupsPerLine : Int = 4, blankSpacer : String = "-", verb : String = "progress"){
+    if(PROGRESS_NEEDS_NEWLINE){
+      startProgressLine(i-1,dotsPerGroup = dotsPerGroup, groupsPerLine = groupsPerLine, verb=verb,spacer = blankSpacer);
+      PROGRESS_NEEDS_NEWLINE = false;
+    }
+    if(i != (groupsPerLine * dotsPerGroup) && i % dotsPerGroup == 0){
+      report(". ",verb);
+    } else {
+      report(".",verb);
+    }
+
+  }
+  def startProgressLine(blankSpaces : Int, spacer : String = "x", groupSpacer : String = " ", dotsPerGroup : Int = 5, groupsPerLine : Int = 4, verb : String = "progress"){
+    var out = "[";
+    var grpNum = blankSpaces / dotsPerGroup;
+    out = out + stdUtils.repString(stdUtils.repString(spacer,dotsPerGroup)+groupSpacer,grpNum);
+    out = out + stdUtils.repString(spacer,blankSpaces % dotsPerGroup);
+    report(out,verb);
+  }
+
+
   /*
    * Reporting options:
    */
@@ -285,6 +311,17 @@ object Reporter {
     }
     warningCount(warnType) += 1;
   }
+  val noticeCount = scala.collection.mutable.Map[String,Int]().withDefault((x : String) => 0);
+  def notice(str : String, warnType : String = "default", limit : Int = -1){
+    val warnCt = noticeCount(warnType)
+    if(limit < 0 || warnCt <= limit){
+      PROGRESS_NEEDS_NEWLINE = true;
+      reportln("  #### NOTE ("+warnType+" "+(warnCt+1)+"):","warn");
+      reportln("    >> "+str.split("\n").mkString("\n    >> "),"warn");
+      if(limit > 0 && warnCt == limit) reportln("       (("+limit+"+ notices of type "+warnType+". Further warnings of this type will be silent.))","warn");
+    }
+    noticeCount(warnType) += 1;
+  }
 
   def error(str : String){
     reportln("<====== FATAL ERROR! ======>","error");
@@ -320,7 +357,31 @@ object Reporter {
       })
       reportln("<------->","warn");
     }
+    if(! noticeCount.keys.isEmpty){
+      reportln("<------->","note");
+      reportln("   Note: "+noticeCount.keySet.map(noticeCount(_)).sum+" Notices Thrown:","note");
+      noticeCount.keySet.foreach(x => {
+        reportln("   "+noticeCount(x)+"\t"+x,"note");
+      })
+      reportln("<------->","note");
+    }  
     loggers.map((logger) => logger.close);
     loggers = List();
   }
+
+  var PROGRESS_REPORT_FUNCTIONS : Vector[(Int) => String] = Vector();
+
+  def getProgressReportStrings(i : Int) : Vector[String] = {
+    PROGRESS_REPORT_FUNCTIONS.map{f => {
+      f(i)
+    }}
+  }
+  def addProgressReportFunction(f : (Int) => String){
+    PROGRESS_REPORT_FUNCTIONS = PROGRESS_REPORT_FUNCTIONS :+ f;
+  }
+  def clearProgressReportFunctions(){
+    PROGRESS_REPORT_FUNCTIONS = Vector()
+  }
+
+
 }
diff --git a/src/HartleyUtils/src/main/scala/internalUtils/commonSeqUtils.scala b/src/HartleyUtils/src/main/scala/internalUtils/commonSeqUtils.scala
@@ -850,33 +850,35 @@ object commonSeqUtils {
 
   //}
 
-  def samRecordPairIterator_resorted(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true) : Iterator[(SAMRecord,SAMRecord)] = {
-
+  def samRecordPairIterator_resorted(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true, prefilterImproperPairs : Boolean = false) : Iterator[(SAMRecord,SAMRecord)] = {
+    val keepImproperPairs : Boolean = ! prefilterImproperPairs;
     if(ignoreSecondary){
        presetProgressReporters.wrapIterator_readPairs(getSRPairIterResorted(iter.filter((read : SAMRecord) => {
-           (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
+           (keepImproperPairs || (read.getReadPairedFlag() && read.getProperPairFlag())) && (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
          })), verbose, testCutoff);
     } else {
       error("FATAL ERROR: Using non-primary read mappings is not currently implemented!");
       return null;
     }
   }
 
-  def samRecordPairIterator_unsorted(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true) : Iterator[(SAMRecord,SAMRecord)] = {
-
+  def samRecordPairIterator_unsorted(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true, prefilterImproperPairs : Boolean = false) : Iterator[(SAMRecord,SAMRecord)] = {
+    val keepImproperPairs : Boolean = ! prefilterImproperPairs;
     if(ignoreSecondary){
        presetProgressReporters.wrapIterator_readPairs(getSRPairIterUnsorted(iter.filter((read : SAMRecord) => {
-           (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
+           (keepImproperPairs || (read.getReadPairedFlag() && read.getProperPairFlag())) && (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
          })), verbose, testCutoff);
     } else {
       error("FATAL ERROR: Using non-primary read mappings is not currently implemented!");
       return null;
     }
   }
-  def samRecordPairIterator_withMulti(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true) : Iterator[(SAMRecord,SAMRecord)] = {
+  def samRecordPairIterator_withMulti(iter : Iterator[SAMRecord], verbose : Boolean = true, testCutoff : Int = -1, ignoreSecondary : Boolean = true, prefilterImproperPairs : Boolean = false) : Iterator[(SAMRecord,SAMRecord)] = {
+    val keepImproperPairs : Boolean = ! prefilterImproperPairs;
+
     if(ignoreSecondary){
       presetProgressReporters.wrapIterator_readPairs(getSRPairIter(iter.filter((read : SAMRecord) => {
-        (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
+        (keepImproperPairs || (read.getReadPairedFlag() && read.getProperPairFlag())) && (! read.getNotPrimaryAlignmentFlag()) && (! getMateUnmappedFlag(read)) && (! read.getReadUnmappedFlag())
       })), verbose, testCutoff);
     } else {
       error("FATAL ERROR: Using non-primary read mappings is not currently implemented!");
@@ -986,6 +988,12 @@ object commonSeqUtils {
 
         while((! pairContainer.contains(curr.getReadName())) && iter.hasNext) {
           pairContainer(curr.getReadName()) = curr;
+          if(curr.getReadPairedFlag() && (! curr.getProperPairFlag())){
+                  warning("NOTE: qorts has detected a read flagged \"improperly paired\"! Some aligners allow \n"+
+                           "paired reads to match to different chromosomes, \n"+
+                           "which can cause errors in QoRTs as it attempts to perform a pairwise coordinate sort. \n"+
+                           "You might need to rerun QoRTs with the --prefilterImproperPairs parameter!","IMPROPER_PAIR",1);
+          }
           if(pairContainerWarningSize < pairContainer.size){
               reportln("NOTE: Unmatched Read Buffer Size > "+pairContainerWarningSize+" [Mem usage:"+MemoryUtil.memInfo+"]","note");
               if(pairContainerWarningSize == initialPairContainerWarningSize){
@@ -1016,7 +1024,7 @@ object commonSeqUtils {
   }
 
 
-  private def getSRPairIterResorted(iter : Iterator[SAMRecord]) : Iterator[(SAMRecord,SAMRecord)] = {
+  private def getSRPairIterResorted(iter : Iterator[SAMRecord], strict : Boolean = true) : Iterator[(SAMRecord,SAMRecord)] = {
     reportln("Starting getSRPairIterResorted...","debug");
 
     val initialPairContainerWarningSize = 100000;
@@ -1037,6 +1045,7 @@ object commonSeqUtils {
         def bufferHasNext : Boolean = iter.hasNext;
         def addNextPairToBuffer {
           var curr = iter.next;
+          var searchCt = 0;
           while((! pairContainer.contains(curr.getReadName())) && iter.hasNext) {
             readOrder = readOrder :+ curr.getReadName();
             pairContainer(curr.getReadName()) = curr;
@@ -1054,11 +1063,26 @@ object commonSeqUtils {
                 }
                 pairContainerWarningSize = pairContainerWarningSize * warningSizeMultiplier;
             }
+            searchCt = searchCt + 1;
             curr = iter.next;            
           }
 
-          if(! pairContainer.contains(curr.getReadName()) ){
-            internalUtils.Reporter.error("ERROR ERROR ERROR: Reached end of bam file, there are "+(pairContainer.size+1)+" orphaned reads, which are marked as having a mapped pair, but no corresponding pair is found in the bam file. \n(Example Orphaned Read Name: "+curr.getReadName()+")");
+          if((! pairContainer.contains(curr.getReadName())) ){
+            if(strict){
+              internalUtils.Reporter.error("ERROR ERROR ERROR  (636): Reached end of bam file, there are "+(pairContainer.size+1)+" orphaned reads, "+
+                                           "which are marked as having a mapped pair, but no "+
+                                           "corresponding pair is found in the bam file. \n"+
+                                           "(Example Orphaned Read Name: "+curr.getReadName()+")\n"+
+                                           "(Read line: "+curr.getSAMString()+")"
+                                           );
+            } else {
+              internalUtils.Reporter.warning("ERROR ERROR ERROR  (636): Reached end of bam file, there are "+(pairContainer.size+1)+" orphaned reads, "+
+                                           "which are marked as having a mapped pair, but no "+
+                                           "corresponding pair is found in the bam file. \n"+
+                                           "(Example Orphaned Read Name: "+curr.getReadName()+")\n"+
+                                           "(Read line: "+curr.getSAMString()+")","UNPAIRED_READ",-1
+                                           );
+              }
           }
           val rB = pairContainer.remove(curr.getReadName()).get;
 
@@ -1074,11 +1098,22 @@ object commonSeqUtils {
           val nextName = readOrder.head;
           readOrder = readOrder.tail;
           if(buffer.contains(nextName)) return buffer.remove(nextName).get;
+          var searchCt = 0;
 
           while(iter.hasNext && (! buffer.contains(nextName))){
             addNextPairToBuffer;
             if(bufferWarningSize < buffer.size){
-                reportln("NOTE: Unmatched Read-PAIR-Buffer Size > "+bufferWarningSize+" [Mem usage:"+MemoryUtil.memInfo+"]","note");
+                val nextSamRead = pairContainer(nextName);
+                reportln("NOTE: Unsorted Read-PAIR-Buffer Size > "+bufferWarningSize+" [Mem usage:"+MemoryUtil.memInfo+"]\n"+
+                         "  Currently searching for read: " + nextName + " for "+searchCt + " iterations."+
+                         "  Searching for read: "+nextName+" "+nextSamRead.getReferenceName()+":"+nextSamRead.getAlignmentStart()+"-"+nextSamRead.getAlignmentEnd+" "+nextSamRead.getFlags()+"\n"+
+                         "  Current unmatched-pair-buffer status: "+pairContainer.size,"note");
+                if( nextSamRead.getReadPairedFlag() && (! nextSamRead.getProperPairFlag()) ){
+                  reportln("NOTE: current-search read is flagged \"improperly paired\"! Some aligners allow \n"+
+                           "paired reads to match to different chromosomes, \n"+
+                           "which can cause errors in QoRTs as it attempts to perform a pairwise coordinate sort. \n"+
+                           "You might need to rerun QoRTs with the --prefilterImproperPairs parameter!","warn");
+                }
                 if(bufferWarningSize == initialPairContainerWarningSize){
                   reportln("    (This is generally not a problem, but if this increases further then OutOfMemoryExceptions\n"+
                            "    may occur.\n"+
@@ -1091,6 +1126,7 @@ object commonSeqUtils {
                 }
                 bufferWarningSize = bufferWarningSize * warningSizeMultiplier;
             }
+            searchCt = searchCt + 1;
           }
           if(!buffer.contains(nextName)){
             internalUtils.Reporter.error("ERROR ERROR ERROR: Reached end of bam file, there are "+(pairContainer.size+1)+" orphaned reads, which are marked as having a mapped pair, but no corresponding pair is found in the bam file. \n(Example Orphaned Read Name: "+nextName+")");

diff --git a/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala b/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala
@@ -49,6 +49,14 @@ object genomicAnnoUtils {
         error("ERROR: EfficientGenomeSeqContainer: requested sequence for chromosome other than current chromosome!");
       }
 
+      if(start < 0){
+        if(end < 0){
+          return repString("N",end-start);
+        } else {
+          return repString("N",-start) + getSeqForInterval(chrom=chrom,start=0,end=end,truncate=truncate);
+        }
+      }
+
       if(start < bufferStart){
         reportln("ERROR: EfficientGenomeSeqContainer: Illegal request for sequence prior to current buffer limits!\n"+
               "   request: "+chrom+":"+start+"-"+end,"note");
@@ -158,35 +166,55 @@ object genomicAnnoUtils {
     //Implementation note: It is important that you never read from remainderIter without first emptying currentIter!
     // Otherwise scala attempts to store the currentIter values.
     // This even remains true if there are no external references to the attached currentIter.
+    def parseChromLine(line : String) : String = {
+      if(line.head != '>'){
+        error("genome fasta file \""+infile+"\" appears mal-formatted! Chromsome line does not start with '>'!:\nOffending line: \""+line+"\"")
+      }
+      line.tail.split("\\s+",2)(0)
+    }
+
     def switchToChrom(chrom : String){
         report("Switching to Chromosome: " + chrom + " ["+getDateAndTimeString+"] ... ","debug");
-        while(currentIter.hasNext) currentIter.next;
-        var iter = remainderIter.dropWhile(line => line != (">"+chrom));
-        if(! iter.hasNext){
-          iter = internalUtils.fileUtils.getLinesSmartUnzip(infile).dropWhile(line => line != (">"+chrom));
-          if(iter.hasNext){
-            reportln("Returning to start of genome FASTA file. NOTE: for optimal performance, sort the FASTA file so that the chromosomes appear in the same order as in the BAM files.","note");
-          } else {
-            error("FATAL ERROR: Cannot find chromosome \""+chrom+"\" in genome FASTA file!")
-          }
-        }
-        val iterPair = iter.drop(1).span(line => line.charAt(0) != '>');
-        currentIter = iterPair._1.map(_.toUpperCase());
-        remainderIter = iterPair._2;
         clearBuffer();
-        currChrom = chrom;
-        report("done ["+getDateAndTimeString+"]\n","debug");
+        switchToChrom(chrom,true);
+        report("    found chrom "+chrom+" ["+getDateAndTimeString+"]\n","debug");
+    }
+    def switchToChrom(chrom : String,circleBack : Boolean){
+          while(chrom != currChrom && reader.hasNext){
+            reportln("   Skipping chrom \""+currChrom+"\" in genome fasta...","debug");
+            skipWhile(reader)(line => line.charAt(0) != '>')
+            if(reader.hasNext){
+              currChrom = parseChromLine(reader.next);
+              if(chrom != currChrom) reportln("   Skipping chrom \""+currChrom+"\" in genome fasta...","debug");
+            }
+          }
+          if(circleBack && currChrom != chrom){
+             notice("Returning to start of genome FASTA file. NOTE: for optimal performance, sort the FASTA file so that the chromosomes appear in the same order as in the BAM files.","LOOPED_GENOME_FASTA",-1);
+             reader = internalUtils.fileUtils.getLinesSmartUnzip(infile).buffered
+             currChrom = parseChromLine(reader.next);
+             switchToChrom(chrom,false);
+          }
     }
-
-    var initialReader = internalUtils.fileUtils.getLinesSmartUnzip(infile);
-    currChrom = initialReader.next().substring(1);
+    var reader = internalUtils.fileUtils.getLinesSmartUnzip(infile).buffered;
+    if(! reader.hasNext){
+      error("genome fasta file \""+infile+"\" appears to be empty!")
+    }
+    if(reader.head.head != '>'){
+      error("genome fasta file \""+infile+"\" appears mal-formatted! First line does not start with '>'!:\nOffending line: \""+reader+"\"")
+    }
+    currChrom = reader.next().substring(1);
     chromList = List[String](currChrom);
-
-    var (currentIter,remainderIter) = initialReader.span(line => line.charAt(0) != '>');
-    currentIter = currentIter.map(_.toUpperCase());
+    val currentIter = new BufferedIterator[String]{
+      def hasNext = reader.hasNext && reader.head.charAt(0) != '>';
+      def next = reader.next.toUpperCase();
+      def head = reader.head;
+    }
+    //var (currentIter,remainderIter) = initialReader.span(line => line.charAt(0) != '>');
+    //currentIter = currentIter.map(_.toUpperCase());
     def currIter = currentIter;
   }
 
+
   class EfficientGenomeSeqContainer_FAList(infiles : Seq[String]) extends EfficientGenomeSeqContainer {
     val fastaMap : Map[String,String] = infiles.map(infile => {
       (internalUtils.fileUtils.getLinesSmartUnzip(infile).next().tail,infile)