All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.RandomizeBam.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

/**
  * Created by tfenne on 6/23/16.
  */

// import com.fulcrumgenomics.FgBioDef._


import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api._
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger, Sorter}
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3

@clp(group=ClpGroups.SamOrBam, description=
"""
  |Randomizes the order of reads in a SAM or BAM file. Randomization is done by sorting
  |on a hash of the `queryname` (and bases and quals if not query-grouping). By default
  |reads with the same query name are grouped together in the output file; this can be
  |turned off by specifying --query-group=false.
""")
class RandomizeBam(
  @arg(flag='i', doc="The input SAM or BAM file.")           val input: PathToBam = Io.StdIn,
  @arg(flag='o', doc="The output SAM or BAM file.")          val output: PathToBam = Io.StdOut,
  @arg(flag='s', doc="Random seed.")                         val seed: Int = 42,
  @arg(flag='q', doc="Group together reads by queryname.")   val queryGroup: Boolean = true
) extends FgBioTool with LazyLogging {

  Io.assertCanWriteFile(input)
  Io.assertCanWriteFile(output)

  override def execute(): Unit = {
    val in = SamSource(input)
    val hasher = new Murmur3(seed)

    val sorter = {
      if (queryGroup) {
        val f = (r: SamRecord) => SamOrder.RandomQueryKey(hasher.hashUnencodedChars(r.name), r.name, r.asSam.getFlags)
        new Sorter(2e6.toInt, new SamRecordCodec(in.header), f)
      }
      else {
        val f = (r: SamRecord) => {
          val key = s"${r.name}:${r.basesString}:${r.qualsString}"
          SamOrder.RandomKey(hasher.hashUnencodedChars(key), r.asSam.getFlags)
        }
        new Sorter(2e6.toInt, new SamRecordCodec(in.header), f)
      }
    }

    logger.info("Sorting reads into random order.")
    val sortProgress = ProgressLogger(logger, verb="sorted", unit=5e6.toInt)
    in.foreach(rec => {
      sorter += rec
      sortProgress.record()
    })

    logger.info("Writing reads out to output.")
    val outHeader = in.header.clone()
    (if (queryGroup) SamOrder.RandomQuery else SamOrder.Random).applyTo(outHeader)
    outHeader.setSortOrder(SortOrder.unsorted)
    if (queryGroup) outHeader.setGroupOrder(GroupOrder.query)
    val out = SamWriter(output, outHeader)

    out ++= sorter
    out.close()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy