All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.fastq.FastqToBam.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.fastq

import java.util
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.CommonsDef.PathToFastq
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.{ConsensusTags, Umis}
import com.fulcrumgenomics.util.SegmentType._
import com.fulcrumgenomics.util.{Io, ProgressLogger, ReadStructure}
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Iso8601Date
import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord}

@clp(group=ClpGroups.Fastq, description=
  """
    |Generates an unmapped BAM (or SAM or CRAM) file from fastq files.  Takes in one or more fastq files (optionally
    |gzipped), each representing a different sequencing read (e.g. R1, R2, I1 or I2) and can use a set of read
    |structures to allocate bases in those reads to template reads, sample indices, unique molecular indices, or to
    |designate bases to be skipped over.
    |
    |Read structures are made up of `` pairs much like the CIGAR string in BAM files. Four kinds of
    |operators are recognized:
    |
    |1. `T` identifies a template read
    |2. `B` identifies a sample barcode read
    |3. `M` identifies a unique molecular index read
    |4. `S` identifies a set of bases that should be skipped or ignored
    |
    |The last `` pair may be specified using a `+` sign instead of number to denote "all remaining
    |bases". This is useful if, e.g., fastqs have been trimmed and contain reads of varying length.  For example
    |to convert a paired-end run with an index read and where the first 5 bases of R1 are a UMI and the second
    |five bases are monotemplate you might specify:
    |
    |```
    |--input r1.fq r2.fq i1.fq --read-structures 5M5S+T +T +B
    |```
    |
    |Alternative if you know your reads are of fixed length you could specify:
    |
    |```
    |--input r1.fq r2.fq i1.fq --read-structures 5M5S65T 75T 8B
    |```
    |
    |For more information on read structures see the
    |[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures)
    |
    |UMIs may be extracted from the read sequences, the read names, or both.  If `--extract-umis-from-read-names` is
    |specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with
    |any UMIs present in the 8th field.  If this option is specified, the `--umi-qual-tag` option may not be used as
    |qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those
    |will also be extracted.  If UMIs are present in both, the final UMIs are constructed by first taking the UMIs
    |from the read names, then adding a hyphen, then the UMIs extracted from the reads.
    |
    |The same number of input files and read structures must be provided, with one exception: if supplying exactly
    |1 or 2 fastq files, both of which are solely template reads, no read structures need be provided.
    |
    |The output file can be sorted by queryname using the `--sort-order` option; the default is to produce a BAM
    |with reads in the same order as they appear in the fastq file.
  """)
class FastqToBam
(
  @arg(flag='i', doc="Fastq files corresponding to each sequencing read (e.g. R1, I1, etc.).") val input: Seq[PathToFastq],
  @arg(flag='o', doc="The output SAM or BAM file to be written.")                              val output: PathToBam,
  @arg(flag='r', doc="Read structures, one for each of the FASTQs.", minElements=0)            val readStructures: Seq[ReadStructure] = Seq(),
  @arg(flag='s', doc="If true, queryname sort the BAM file, otherwise preserve input order.")  val sort: Boolean = false,
  @arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.")                         val umiTag: String = ConsensusTags.UmiBases,
  @arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.")                 val umiQualTag: Option[String] = None,
  @arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.")                      val storeSampleBarcodeQualities: Boolean = false,
  @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.")         val extractUmisFromReadNames: Boolean = false,
  @arg(          doc="Read group ID to use in the file header.")                               val readGroupId: String = "A",
  @arg(          doc="The name of the sequenced sample.")                                      val sample: String,
  @arg(          doc="The name/ID of the sequenced library.")                                  val library: String,
  @arg(flag='b', doc="Library or Sample barcode sequence.")                                    val barcode: Option[String] = None,
  @arg(          doc="Sequencing Platform.")                                                   val platform: String = "illumina",
  @arg(doc="Platform unit (e.g. '..')")                val platformUnit: Option[String] = None,
  @arg(doc="Platform model to insert into the group header (ex. miseq, hiseq2500, hiseqX)")    val platformModel: Option[String] = None,
  @arg(doc="The sequencing center from which the data originated")                             val sequencingCenter: Option[String] = None,
  @arg(doc="Predicted median insert size, to insert into the read group header")               val predictedInsertSize: Option[Integer] = None,
  @arg(doc="Description of the read group.")                                                   val description: Option[String] = None,
  @arg(doc="Comment(s) to include in the output file's header.", minElements = 0)              val comment: List[String] = Nil,
  @arg(doc="Date the run was produced, to insert into the read group header")                  val runDate: Option[Iso8601Date] = None
)
  extends FgBioTool with LazyLogging {

  // If no read structures are provided and we only have 1-2 fastqs, assume that they are just template reads
  private val actualReadStructures = if (readStructures.isEmpty && (1 to 2 contains input.length)) input.map(_ => ReadStructure("+T")) else readStructures

  Io.assertReadable(input)
  Io.assertCanWriteFile(output)
  validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.")
  validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.")
  validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.")

  override def execute(): Unit = {
    val encoding = qualityEncoding
    val writer   = makeSamWriter()
    val iterator = FastqSource.zipped(this.input.map(FastqSource(_)))

    iterator.foreach { fqs => writer ++= makeSamRecords(fqs, actualReadStructures, writer.header, encoding) }
    writer.close()
  }

  /** Makes the SAMFileWriter we'll use to output the file. */
  protected def makeSamWriter(): SamWriter = {
    val header = new SAMFileHeader
    header.setSortOrder(if (sort) SortOrder.queryname else SortOrder.unsorted)
    header.setGroupOrder(GroupOrder.query)
    header.setComments(util.Arrays.asList(this.comment:_*))

    val rg = new SAMReadGroupRecord(this.readGroupId)
    rg.setSample(sample)
    rg.setLibrary(library)
    this.barcode.foreach(bc => rg.setBarcodes(util.Arrays.asList(bc)))
    rg.setPlatform(this.platform)
    this.platformUnit.foreach(pu => rg.setPlatformUnit(pu))
    this.sequencingCenter.foreach(cn => rg.setSequencingCenter(cn))
    this.predictedInsertSize.foreach(isize => rg.setPredictedMedianInsertSize(isize))
    this.platformModel.foreach(pm => rg.setPlatformModel(pm))
    this.runDate.foreach(date => rg.setRunDate(date))
    this.description.foreach(desc => rg.setDescription(desc))
    header.addReadGroup(rg)

    SamWriter(this.output, header, sort = if (sort) Some(SamOrder.Queryname) else None)
  }

  /** Generates SamRecords for each of the template reads across the read structures. */
  protected def makeSamRecords(fqs: Seq[FastqRecord],
                               structures: Seq[ReadStructure],
                               header: SAMFileHeader,
                               encoding: QualityEncoding
                              ): Seq[SamRecord] = {
    // Make the SamRecords inside a try so we can provide more informative error messages
    try {
      val subs = fqs.iterator.zip(structures.iterator).flatMap { case(fq, rs) => rs.extract(fq.bases, fq.quals) }.toIndexedSeq
      val sampleBarcode = subs.iterator.filter(_.kind == SampleBarcode).map(_.bases).mkString("-")
      val sampleQuals   = subs.iterator.filter(_.kind == SampleBarcode).map(_.quals).mkString(" ")
      val umi           = subs.iterator.filter(_.kind == MolecularBarcode).map(_.bases).mkString("-")
      val umiQual       = subs.iterator.filter(_.kind == MolecularBarcode).map(_.quals).mkString(" ")
      val templates     = subs.iterator.filter(_.kind == Template).toList

      // If requested, pull out the UMI(s) from the read name
      val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None

      templates.zipWithIndex.map { case (read, index) =>
        // If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy
        val empty = read.bases.isEmpty

        val rec = SamRecord(header)
        rec(ReservedTagConstants.READ_GROUP_ID) = this.readGroupId
        rec.name  = fqs.head.name
        rec.bases = if (empty) "N" else read.bases
        rec.quals = if (empty) Array[Byte](2) else encoding.toStandardNumeric(read.quals)
        rec.unmapped = true
        if (templates.size == 2) {
          rec.paired = true
          rec.mateUnmapped = true
          if (index == 0) rec.firstOfPair = true else rec.secondOfPair = true
        }

        if (sampleBarcode.nonEmpty) rec("BC") = sampleBarcode
        if (storeSampleBarcodeQualities && sampleQuals.nonEmpty) rec("QT") = sampleQuals

        // Set the UMI on the read depending on whether we got UMIs from the read names, reads or both
        (umi, umiFromReadName) match {
          case ("",       Some(fromName)) => rec(this.umiTag) = fromName
          case ("",       None          ) => ()
          case (fromRead, Some(fromName)) => rec(this.umiTag) = s"${fromName}-${fromRead}"
          case (fromRead, None          ) =>
            rec(this.umiTag) = fromRead
            this.umiQualTag.foreach(rec(_) = umiQual)
        }

        rec
      }
    }
    catch {
      case ex: Exception => fail(s"Failed to process record(s) with name ${fqs.map(_.name).head} due to: ${ex.getMessage}")
    }
  }

  /** Determine the quality encoding of the incoming fastq files. */
  protected def qualityEncoding: QualityEncoding = {
    val readers  = input.map { fastq => FastqSource(fastq) }
    val iterator = readers.tail.foldLeft(readers.head.iterator) {(a,b) => a ++ b }.map(_.quals)
    val detector = new QualityEncodingDetector
    detector.sample(iterator)
    readers.foreach(_.safelyClose())
    detector.rankedCompatibleEncodings(q=30) match {
      case Nil        => fail("Quality scores in FASTQ file do not match any known encoding.")
      case enc :: Nil => yieldAndThen(enc) { logger.info("Detected fastq quality encoding: ", enc) }
      case enc :: xs  =>
        logger.info(s"Could not uniquely determine quality encoding. Using $enc, other valid encodings: ${xs.mkString(", ")}")
        enc
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy