
com.fulcrumgenomics.bam.AutoGenerateReadGroupsByName.scala Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2016 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
package com.fulcrumgenomics.bam
import java.util.concurrent.atomic.AtomicInteger
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import htsjdk.samtools._
import htsjdk.samtools.util.Iso8601Date
import scala.collection.mutable
@clp(group = ClpGroups.SamOrBam, description =
"""
|Adds read groups to a BAM file for a single sample by parsing the read names.
|
|Will add one or more read groups by parsing the read names. The read names should be of the form:
|
|```
|::::::
|```
|
|Each unique combination of `:::` will be its own read group. The ID of the
|read group will be an integer and the platform unit will be `.`.
|
|The input is assumed to contain reads for one sample and library. Therefore, the sample and library must be given
|and will be applied to all read groups. Read groups will be replaced if present.
|
|Two passes will be performed on the input: first to gather all the read groups, and second to write the output BAM
|file.
"""
)
class AutoGenerateReadGroupsByName
(@arg(flag='i', doc="Input SAM or BAM file") val input: PathToBam,
@arg(flag='o', doc="Output SAM or BAM file") val output: PathToBam,
@arg(flag='s', doc="The sample to insert into the read groups") val sample: String,
@arg(flag='l', doc="The library to insert into the read groups") val library: String,
@arg(doc="The sequencing center from which the data originated") val sequencingCenter: Option[String] = None,
@arg(doc="Predicted median insert size, to insert into the read groups") val predictedInsertSize: Option[Integer] = None,
@arg(doc="Program group to insert into the read groups") val programGroup: Option[String] = None,
@arg(doc="Platform model to insert into the groups (free-form text providing further details of the platform/technology used)") val platformModel: Option[String] = None,
@arg(doc="Description inserted into the read groups") val description: Option[String] = None,
@arg(doc="Date the run was produced (ISO 8601: `YYYY-MM-DD` ), to insert into the read groups") val runDate: Option[Iso8601Date] = None,
@arg(doc="Comment(s) to include in the merged output file's header.", minElements = 0) val comments: List[String] = Nil,
@arg(doc="The sort order for the output sam/bam file.") val sortOrder: Option[SamOrder] = None
) extends FgBioTool with LazyLogging {
Io.assertReadable(input)
Io.assertCanWriteFile(output)
private val nextId = new AtomicInteger(0)
private val readGroups = new mutable.HashMap[RunInfoFromRead, SAMReadGroupRecord]()
override def execute(): Unit = {
// Gather all the read groups
{
val progress = ProgressLogger(logger, verb = "read", unit = 5e6.toInt)
val in = SamSource(input)
in.foreach { record =>
val runInfo = RunInfoFromRead(name=record.name)
if (!readGroups.contains(runInfo)) {
val readGroup = new SAMReadGroupRecord(nextId.incrementAndGet().toString)
readGroup.setSample(sample)
readGroup.setLibrary(library)
readGroup.setPlatformUnit(runInfo.platformUnit)
platformModel.foreach(readGroup.setPlatformModel)
readGroup.setPlatform("ILLUMINA")
sequencingCenter.foreach(readGroup.setSequencingCenter)
predictedInsertSize.foreach(readGroup.setPredictedMedianInsertSize)
programGroup.foreach(readGroup.setProgramGroup)
description.foreach(readGroup.setDescription)
runDate.foreach(readGroup.setRunDate)
readGroups(runInfo) = readGroup
}
progress.record(record)
}
in.safelyClose()
}
// Write them all out
{
val in = SamSource(input)
val outHeader = in.header.clone()
outHeader.setReadGroups(java.util.Arrays.asList(readGroups.values.toSeq:_*))
comments.foreach(outHeader.addComment)
val out = SamWriter(output, outHeader, sort=sortOrder)
in.foreach { record =>
val runInfo = RunInfoFromRead(name = record.name)
val readGroupId = readGroups(runInfo).getReadGroupId
record(SAMTag.RG.name()) = readGroupId
out += record
}
in.safelyClose()
out.close()
}
}
}
/** Stores parsed run-level information about a read. */
private[bam] case class RunInfoFromRead(instrument: String, runNumber: String, flowcellId: String, lane: Int)
{
/** The platform unit: */
val platformUnit: String = s"$flowcellId.$lane"
}
private[bam] object RunInfoFromRead {
/** Parse a read name to get run-level information. */
def apply(name: String): RunInfoFromRead = {
val tokens = name.split(':').toSeq
if (tokens.length != 7) throw new IllegalArgumentException(s"Expected seven colon-delimited fields for read with name: $name")
new RunInfoFromRead(
instrument = tokens(0),
runNumber = tokens(1),
flowcellId = tokens(2),
lane = tokens(3).toInt
)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy