All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.util.PickIlluminaIndices.scala Maven / Gradle / Ivy

The newest version!
package com.fulcrumgenomics.util

import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.FgBioDef._
import scala.jdk.CollectionConverters._
import com.fulcrumgenomics.sopt.{arg, clp}

/**
  * Program for picking sets of indices of arbitrary length that meet certain constraints
  * and attempt to maximize the edit distance between all members of the set picked.
  *
  * @author Tim Fennell
  */
@clp(description =  
  """
    |Picks a set of molecular indices that should work well together.
    |
    |The indexes to evaluate are generated randomly, unless the `--candidates` option is given.  The latter
    |specifies a path to a file with one index per line from which indices are picked.
  """,
  group = ClpGroups.Utilities)
class PickIlluminaIndices
(
  @arg(flag='l', doc="The length of each barcode sequence.")                           val length: Int = 8,
  @arg(flag='n', doc="The number of indices desired.")                                 val indices: Int,
  @arg(flag='e', doc="The minimum edit distance between two indices in the set.")      val editDistance: Int = 3,
  @arg(flag='o', doc="File to write indices to.")                                      val output: FilePath,
  @arg(          doc="Allow indices that are lexical reverses of one another")         val allowReverses: Boolean = false,
  @arg(          doc="Allow indices that are reverse complements of one another")      val allowReverseComplements: Boolean = false,
  @arg(          doc="Allow indices that are palindromic (`bases == rev(bases)`).")    val allowPalindromes: Boolean = false,
  @arg(          doc="Reject indices with a homopolymer of greater than this length.") val maxHomopolymer: Int = 2,
  @arg(          doc="The minimum GC fraction for a barcode to be accepted.")          val minGc: Double = 0,
  @arg(          doc="The maximum GC fraction for a barcode to be accepted.")          val maxGc: Double = 0.7,
  @arg(flag='t', doc="Number of threads to use.")                                      val threads: Int = 4,
  @arg(          doc="The installation directory for `ViennaRNA`.")                    val viennaRnaDir: Option[DirPath] = None,
  @arg(          doc="The lowest acceptable secondary structure `deltaG`.")            val minDeltaG: Double = -10,
  @arg(          doc="The indexed adapter sequence into which the indices will be integrated.")
  val adapters: Seq[String] = IlluminaAdapters.DualIndexed.both,
  @arg(          doc="Sequences that should be avoided.  Any kmer of `length` that appears in these sequences and their reverse complements will be thrown out.")
  val avoidSequence: Seq[String] = IlluminaAdapters.all.flatMap(_.both),
  @arg(          doc="The candidate indices from which to choose with one index per line (nothing else), otherwise generate all possible indices")
  val candidates: Option[FilePath] = None
) extends FgBioTool{

  override def execute(): Unit = {
    // Get logging messages from PickIlluminaIndicesCommand
    htsjdk.samtools.util.Log.setGlobalLogLevel(htsjdk.samtools.util.Log.LogLevel.INFO)

    // Read in existing candidates if desired
    val candidates: Seq[Array[Byte]] = this.candidates.map { path =>
      Io.readLines(path).map(_.trim.toUpperCase).filter(_.nonEmpty).map(_.getBytes).toSeq
    }.getOrElse(Seq.empty)
    candidates.foreach { candidate =>
      require(candidate.length == this.length, s"Candidate length was ${candidate.length} but expected $length: ${new String(candidate)}")
    }

    val cmd = new PickIlluminaIndicesCommand()
    cmd.INDEX_LENGTH              = length
    cmd.N_INDICES                 = indices
    cmd.EDIT_DISTANCE             = editDistance
    cmd.ALLOW_REVERSES            = allowReverses
    cmd.ALLOW_REVERSE_COMPLEMENTS = allowReverseComplements
    cmd.ALLOW_PALINDROMES         = allowPalindromes
    cmd.MAX_HOMOPOLYMER           = maxHomopolymer
    cmd.MIN_GC                    = minGc
    cmd.MAX_GC                    = maxGc
    cmd.OUTPUT                    = output.toFile
    cmd.NUM_THREADS               = threads
    cmd.VIENNA_RNA_DIR            = viennaRnaDir.map(_.toFile).orNull
    cmd.MIN_DELTAG                = minDeltaG
    cmd.INDEX_ADAPTER             = adapters.asJava
    cmd.AVOID_SEQUENCE            = avoidSequence.asJava
    cmd.CANDIDATES                = candidates.asJava
    cmd.execute()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy