All downloads are free. The search and download features use the official Maven repository.

com.fulcrumgenomics.fastq.SortFastq.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2019 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.fastq

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, Sorter}

/** Companion object for the SortFastq tool; defines the codec and the sort key it hands to the sorter. */
object SortFastq {
  import java.nio.charset.StandardCharsets

  /**
    * Codec that stores a record as the bytes of its `toString` text and rebuilds it from that text.
    *
    * Both directions use UTF-8 explicitly so the round trip does not depend on the JVM's default
    * charset (a default that cannot represent a character would otherwise replace it with `?`).
    */
  private class FastqCodec extends Sorter.Codec[FastqRecord] {
    /** Encodes the record directly to fastq, as UTF-8 bytes of `rec.toString`. */
    override def encode(rec: FastqRecord): Array[Byte] = rec.toString.getBytes(StandardCharsets.UTF_8)

    /**
      * Decodes the record from the fastq string.
      *
      * Reads four lines from `bs(start until start + length)`: the first line minus its leading
      * character becomes the name, the second the bases, the third is skipped and the fourth
      * becomes the qualities.  Only `name`, `bases` and `quals` are passed to `FastqRecord`, so any
      * other fields take whatever defaults `FastqRecord` declares.
      *
      * @param bs     buffer holding the encoded record
      * @param start  offset of the first byte of the record within `bs`
      * @param length number of bytes belonging to the record
      */
    override def decode(bs: Array[Byte], start: Int, length: Int): FastqRecord = {
      val lines = new String(bs, start, length, StandardCharsets.UTF_8).linesIterator
      val name  = lines.next().substring(1) // drop the first character of the header line
      val bases = lines.next()
      lines.next()                          // third line is consumed but not stored
      val quals = lines.next()

      FastqRecord(name=name, bases=bases, quals=quals)
    }
  }

  /**
    * Key that orders records by name (via `String.compareTo`) and then by `number` to break ties.
    *
    * @param name   the name to order by first
    * @param number the tie-breaker applied when names compare equal
    */
  private class SortKey(val name: String, val number: Byte) extends Ordered[SortKey] {
    override def compare(that: SortKey): Int = {
      val byName = this.name.compareTo(that.name)
      // Bytes widen to Int before the subtraction, so the difference cannot overflow.
      if (byName != 0) byName else this.number - that.number
    }
  }
}

@clp(group=ClpGroups.Fastq, description=
  """
    |Sorts a FASTQ file.  Sorts the records in a FASTQ file based on the lexicographic ordering
    |of their read names.  Input and output files can be either uncompressed or gzip-compressed.
  """)
class SortFastq
( @arg(flag='i', doc="Input fastq file.") input: PathToFastq,
  @arg(flag='o', doc="Output fastq file.") output: PathToFastq,
  @arg(flag='m', doc="Maximum records to keep in RAM at one time.") val maxRecordsInRam: Int = 5e5.toInt
) extends FgBioTool {
  import SortFastq._

  // These run as part of construction, i.e. before execute() is ever called.
  Io.assertReadable(input)
  Io.assertCanWriteFile(output)

  /**
    * Feeds every record from `input` into a sorter and then writes the sorter's contents to `output`.
    *
    * Records are keyed by name and then by read number; a record without a read number is keyed
    * as if its read number were 1.  The input source and the output writer are each closed in a
    * `finally` block so that neither handle is leaked if reading, sorting or writing throws.
    */
  override def execute(): Unit = {
    val in = FastqSource(input)
    val sorter = try {
      val s = new Sorter[FastqRecord, SortKey](
        maxObjectsInRam = maxRecordsInRam,
        codec           = new FastqCodec,
        keyfunc         = r => new SortKey(r.name, r.readNumber.getOrElse(1).toByte)
      )
      s ++= in
      s
    }
    finally {
      // Always release the input, even when loading the sorter fails part-way through.
      in.safelyClose()
    }

    val out = FastqWriter(output)
    try {
      out ++= sorter.iterator
    }
    finally {
      // close() rather than safelyClose(): a failure to close the output must still be reported.
      out.close()
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy