All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.fastq.TrimFastq.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.fulcrumgenomics.fastq

import com.fulcrumgenomics.cmdline.{FgBioTool, ClpGroups}
import com.fulcrumgenomics.util.ProgressLogger
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._

/**
  * Command line tool that trims the reads in one or more line-matched fastq files to a given length.
  *
  * Records are pulled from all inputs in lock-step, one record per input per iteration. Each group
  * of records is either written out (trimmed) to the corresponding outputs, or, when `exclude` is
  * set and any record in the group is shorter than its trim length, dropped from all outputs.
  *
  * @param input   the input fastq files; at least one is required
  * @param output  the output fastq files, one per input, paired with the inputs by position
  * @param length  either a single trim length used for every input, or one trim length per input
  * @param exclude when true, drop a group of records from all outputs if any record in the group
  *                is shorter than the trim length for its file
  */
@clp(
  description =
    """
      |Trims reads in one or more line-matched fastq files to a specific read length. The
      |individual fastq files are expected to have the same set of reads, as would be the
      |case with an `r1.fastq` and `r2.fastq` file for the same sample.
      |
      |Optionally supports dropping of reads across all files when one or more reads
      |is already shorter than the desired trim length.
      |
      |Input and output fastq files may be gzipped.
    """,
  group=ClpGroups.Fastq
)
class TrimFastq
(   @arg(flag='i', doc="One or more input fastq files.") val input:  Seq[PathToFastq],
    @arg(flag='o', doc="A matching number of output fastq files.") val output: Seq[PathToFastq],
    @arg(flag='l', doc="Length to trim reads to (either one per input fastq file, or one for all).") val length: Seq[Int],
    @arg(flag='x', doc="Exclude reads below the trim length.") val exclude: Boolean = false
) extends FgBioTool with LazyLogging {

  // Without this check an empty input list would only fail later, inside allHaveNext(), with a
  // misleading "did not contain same number of records" error.
  validate(input.nonEmpty, "At least one input fastq file must be provided.")
  validate(input.size == output.size, "Number of input and output files must match.")
  validate(length.size == 1 || input.size == length.size, "Number of lengths must be one or match the number of input files.")

  /**
    * Reads all inputs in lock-step, trims (or discards) each group of records and writes the
    * result to the matching outputs. All sources and writers are closed before returning,
    * whether the run succeeds or fails.
    *
    * @throws IllegalStateException if the inputs do not all contain the same number of records
    */
  override def execute(): Unit = {
    var discarded: Long = 0
    val progress = new ProgressLogger(this.logger, noun="records", verb="Wrote")

    // Expand a single length so that there is exactly one trim length per input file.
    val lengths = if (this.length.size == 1) List.fill(this.input.size)(this.length.head) else this.length

    val sources = input.map(FastqSource(_))
    try {
      val writers = output.map(FastqWriter(_))
      try {
        while (allHaveNext(sources)) {
          val recs = sources.map(_.next())
          if (exclude && recs.zip(lengths).exists { case (rec, len) => rec.length < len }) {
            // One discard is counted per group of records, not per file.
            discarded += 1
          }
          else {
            writers.lazyZip(recs).lazyZip(lengths).foreach { case (w: FastqWriter, r: FastqRecord, l: Int) =>
              w.write(r.trimmedTo(l))
              // Progress is recorded once for every record written to every output file.
              progress.record()
            }
          }
        }
      }
      finally {
        // Close the writers even if the loop aborted (e.g. on a record-count mismatch).
        writers.foreach(_.close())
      }
    }
    finally {
      // The inputs are released last, and also if opening the writers failed.
      sources.foreach(_.close())
    }

    logger.info(s"Wrote ${progress.getCount} records and discarded ${discarded}.")
  }

  /**
    * Checks to see that all the input sources have another record, or that none do.
    *
    * @param xs the iterators to inspect
    * @return true if every iterator has another element, false if none of them do
    * @throws IllegalStateException if only some of the iterators have another element; an empty
    *                               `xs` also ends up in this case
    */
  private def allHaveNext(xs: Seq[Iterator[_ <: Any]]): Boolean = {
    xs.map(_.hasNext).distinct match {
      case Seq(true)  => true
      case Seq(false) => false
      case _          => throw new IllegalStateException("Fastq files did not contain same number of records.")
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy