All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.util.BaitDesigner Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
package picard.util;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.None;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Pattern;

/**
 * Designs baits for hybrid selection!
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
                usage = BaitDesigner.USAGE_SUMMARY + BaitDesigner.USAGE_DETAILS,
                usageShort = BaitDesigner.USAGE_SUMMARY,
                programGroup = None.class
        )
public class BaitDesigner extends CommandLineProgram {
static final String USAGE_SUMMARY = "Designs oligonucleotide baits for hybrid selection reactions.";
static final String USAGE_DETAILS = "

This tool is used to design custom bait sets for hybrid selection experiments. The following " + "files are input into BaitDesigner: a (TARGET) interval list indicating the sequences of interest, e.g. exons with their " + "respective coordinates, a reference sequence, and a unique identifier string (DESIGN_NAME).

" + "

The tool will output interval_list files of both bait and target sequences as well as the actual bait sequences in " + "FastA format. At least two baits are output for each target sequence, with greater numbers for larger intervals. Although " + "the default values for both bait size (120 bases) nd offsets (80 bases) are suitable for most applications, these values can " + "be customized. Offsets represent the distance between sequential baits on a contiguous stretch of target DNA sequence.

"+ "

The tool will also output a pooled set of 55,000 (default) oligonucleotides representing all of the baits redundantly. " + "This redundancy achieves a uniform concentration of oligonucleotides for synthesis by a vendor as well as equal numbers" + "of each bait to prevent bias during the hybrid selection reaction.

" + "

Usage example:

" + "
" +
        "java -jar picard.jar BaitDesigner \\
" + " TARGET=targets.interval_list \\
" + " DESIGN_NAME=new_baits \\
" + " R=reference_sequence.fasta " + "
" + "
"; /** * Subclass of Interval for representing Baits, that caches the bait sequence. */ static class Bait extends Interval { byte[] bases; /** Pass through constructor. */ public Bait(final String sequence, final int start, final int end, final boolean negative, final String name) { super(sequence, start, end, negative, name); } /** Method that takes in the reference sequence this bait is on and caches the bait's bases. */ public void addBases(final ReferenceSequence reference, final boolean useStrandInfo) { final byte[] tmp = new byte[length()]; System.arraycopy(reference.getBases(), getStart() - 1, tmp, 0, length()); if (useStrandInfo && isNegativeStrand()) { SequenceUtil.reverseComplement(tmp); } setBases(tmp); } public int getMaskedBaseCount() { return BaitDesigner.getMaskedBaseCount(bases, 0, bases.length); } @Override public String toString() { return "Bait{" + "name=" + getName() + ", bases=" + StringUtil.bytesToString(bases) + '}'; } public void setBases(final byte[] bases) { this.bases = bases;} public byte[] getBases() { return bases; } } /** * Set of possible design strategies for bait design. */ public enum DesignStrategy { /** Implementation that "constrains" baits to be within the target region when possible. */ CenteredConstrained { List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) { final List baits = new LinkedList(); final int baitSize = designer.BAIT_SIZE; final int baitOffset = designer.BAIT_OFFSET; // Treat targets less than a bait length differently if (target.length() <= baitSize) { final int midpoint = target.getStart() + (target.length() / 2); final int baitStart = midpoint - (baitSize / 2); final Bait bait = new Bait(target.getContig(), baitStart, CoordMath.getEnd(baitStart, baitSize), target.isNegativeStrand(), designer.makeBaitName(target.getName(), 1, 1)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); } else { // Work out how many baits we need and how to space them final int baitCount = 1 + (int) Math.ceil((target.length() - baitSize) / (double) baitOffset); final int firstBaitStart = target.getStart(); final int lastBaitStart = CoordMath.getStart(target.getEnd(), baitSize); final double actualShift = (lastBaitStart - firstBaitStart) / (double) (baitCount - 1); // And then design them int baitIndex = 1; int start = firstBaitStart; while (start <= lastBaitStart) { final int end = CoordMath.getEnd(start, baitSize); final Bait bait = new Bait(target.getContig(), start, end, target.isNegativeStrand(), designer.makeBaitName(target.getName(), baitIndex, baitCount)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); // Recalculate from actualShift to avoid compounding rounding errors start = firstBaitStart + (int) Math.round(actualShift * baitIndex); ++baitIndex; } } return baits; } }, /** * Design that places baits at fixed offsets over targets, allowing them to hang off the ends * as dictated by the target size and offset. */ FixedOffset { List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) { final List baits = new LinkedList(); final int baitSize = designer.BAIT_SIZE; final int baitOffset = designer.BAIT_OFFSET; final int minTargetSize = baitSize + (baitOffset * (designer.MINIMUM_BAITS_PER_TARGET - 1)); // Redefine the target to be the size of a bait, or the size that a number of baits // tiled tiles across the target at the fixed offset final Interval t2; if (target.length() < minTargetSize) { final int addon = minTargetSize - target.length(); final int left = addon / 2; final int right = addon - left; t2 = new Interval(target.getContig(), Math.max(target.getStart() - left, 1), Math.min(target.getEnd() + right, reference.length()), target.isNegativeStrand(), target.getName()); } else { t2 = target; } // Work out how many baits we need and how to space them final int baitCount = 1 + (int) Math.ceil((t2.length() - baitSize) / (double) baitOffset); final int baitedBases = baitSize + (baitOffset * (baitCount - 1)); final int firstBaitStart = Math.max(t2.getStart() - ((baitedBases - t2.length()) / 2), 1); // And then design them final byte[] bases = reference.getBases(); final int MAX_MASKED = designer.REPEAT_TOLERANCE; for (int i = 1; i <= baitCount; ++i) { int start = firstBaitStart + (baitOffset * (i - 1)); int end = CoordMath.getEnd(start, baitSize); if (end > reference.length()) break; // If there are too many soft masked bases try shifting it around if (getMaskedBaseCount(bases, start - 1, end) > MAX_MASKED) { final int maxMove = baitOffset * 3 / 4; for (int move = 1; move <= maxMove; move++) { // Move it "backwards?" if (start - move >= 1 && getMaskedBaseCount(bases, start - move - 1, end - move) <= MAX_MASKED) { start = start - move; end = end - move; break; } // Move it "forwards"? if (end + move <= reference.length() && getMaskedBaseCount(bases, start + move - 1, end + move) <= MAX_MASKED) { start = start + move; end = end + move; break; } } } final Bait bait = new Bait(t2.getContig(), start, end, t2.isNegativeStrand(), designer.makeBaitName(t2.getName(), i, baitCount)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); } return baits; } }, /** * Ultra simple bait design algorithm that just lays down baits starting at the target start position * until either the bait start runs off the end of the target or the bait would run off the sequence */ Simple { @Override List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) { final List baits = new LinkedList(); final int baitSize = designer.BAIT_SIZE; final int baitOffset = designer.BAIT_OFFSET; final int lastPossibleBaitStart = Math.min(target.getEnd(), reference.length() - baitSize); final int baitCount = 1 + (int) Math.floor((lastPossibleBaitStart - target.getStart()) / (double) baitOffset); int i = 0; for (int start = target.getStart(); start < lastPossibleBaitStart; start += baitOffset) { final Bait bait = new Bait(target.getContig(), start, CoordMath.getEnd(start, baitSize), target.isNegativeStrand(), designer.makeBaitName(target.getName(), ++i, baitCount)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); } return baits; } }; /** Design method that each Design Strategy must implement. */ abstract List design(BaitDesigner designer, Interval target, ReferenceSequence reference); } /////////////////////////////////////////////////////////////////////////// // Options for the Bait Designer /////////////////////////////////////////////////////////////////////////// @Option(shortName = "T", doc = "The file with design parameters and targets") public File TARGETS; @Option(doc = "The name of the bait design") public String DESIGN_NAME; @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "The reference sequence fasta file") public File REFERENCE_SEQUENCE; @Option(doc = "The left amplification primer to prepend to all baits for synthesis") public String LEFT_PRIMER = "ATCGCACCAGCGTGT"; @Option(doc = "The right amplification primer to prepend to all baits for synthesis") public String RIGHT_PRIMER = "CACTGCGGCTCCTCA"; @Option(doc = "The design strategy to use to layout baits across each target") public DesignStrategy DESIGN_STRATEGY = DesignStrategy.FixedOffset; @Option(doc = "The length of each individual bait to design") public int BAIT_SIZE = 120; @Option(doc = "The minimum number of baits to design per target.") public int MINIMUM_BAITS_PER_TARGET = 2; @Option(doc = "The desired offset between the start of one bait and the start of another bait for the same target.") public int BAIT_OFFSET = 80; @Option(doc = "Pad the input targets by this amount when designing baits. Padding is applied on both sides in this amount.") public int PADDING = 0; @Option(doc = "Baits that have more than REPEAT_TOLERANCE soft or hard masked bases will not be allowed") public int REPEAT_TOLERANCE = 50; @Option(doc = "The size of pools or arrays for synthesis. If no pool files are desired, can be set to 0.") public int POOL_SIZE = 55000; @Option(doc = "If true, fill up the pools with alternating fwd and rc copies of all baits. Equal copies of " + "all baits will always be maintained") public boolean FILL_POOLS = true; @Option(doc = "If true design baits on the strand of the target feature, if false always design on the + strand of " + "the genome.") public boolean DESIGN_ON_TARGET_STRAND = false; @Option(doc = "If true merge targets that are 'close enough' that designing against a merged target would be more efficient.") public boolean MERGE_NEARBY_TARGETS = true; @Option(doc = "If true also output .design.txt files per pool with one line per bait sequence") public boolean OUTPUT_AGILENT_FILES = true; @Option(shortName = "O", optional = true, doc = "The output directory. If not provided then the DESIGN_NAME will be used as the output directory") public File OUTPUT_DIRECTORY; // "Output" members that will also get picked up by writeParametersFile() int TARGET_TERRITORY; int TARGET_COUNT; int BAIT_TERRITORY; int BAIT_COUNT; int BAIT_TARGET_TERRITORY_INTERSECTION; int ZERO_BAIT_TARGETS; double DESIGN_EFFICIENCY; // Utility objects private static final Log log = Log.getInstance(BaitDesigner.class); private final NumberFormat fmt = NumberFormat.getIntegerInstance(); /** Takes a target name and a bait index and creates a uniform bait name. */ String makeBaitName(final String targetName, final int baitIndex, final int totalBaits) { final String total = fmt.format(totalBaits); String bait = fmt.format(baitIndex); // Pad out the bait number to match the longest one for this target while (bait.length() < total.length()) bait = "0" + bait; return targetName + "_bait#" + bait; } /** Returns the total of soft or hard masked bases in the interval of bases. */ public static int getMaskedBaseCount(final byte[] bases, final int from, final int until) { int count = 0; for (int i = from; i < until; i++) { final byte b = bases[i]; if (b != 'A' && b != 'C' && b != 'G' && b != 'T') ++count; } return count; } /** Stock main method. */ public static void main(final String[] args) { new BaitDesigner().instanceMainWithExit(args); } @Override protected String[] customCommandLineValidation() { final List errors = new ArrayList(); final Pattern p = Pattern.compile("^[ACGTacgt]*$"); if (LEFT_PRIMER != null && !p.matcher(LEFT_PRIMER).matches()) { errors.add("Left primer " + LEFT_PRIMER + " is not a valid primer sequence."); } if (RIGHT_PRIMER != null && !p.matcher(RIGHT_PRIMER).matches()) { errors.add("Right primer " + RIGHT_PRIMER + " is not a valid primer sequence."); } if (!errors.isEmpty()) return errors.toArray(new String[errors.size()]); else return null; } int estimateBaits(final int start, final int end) { final int length = end - start + 1; return Math.max(MINIMUM_BAITS_PER_TARGET, (int) (Math.ceil(length - BAIT_SIZE) / (double) BAIT_OFFSET) + 1); } /** * Main method that coordinates the checking of inputs, designing of baits and then * the writing out of all necessary files. * * @return */ @Override protected int doWork() { // Input parameter munging and checking if (OUTPUT_DIRECTORY == null) OUTPUT_DIRECTORY = new File(DESIGN_NAME); IOUtil.assertFileIsReadable(TARGETS); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); if (!OUTPUT_DIRECTORY.exists()) { OUTPUT_DIRECTORY.mkdirs(); } IOUtil.assertDirectoryIsWritable(OUTPUT_DIRECTORY); // Load up the targets and the reference final IntervalList targets; final IntervalList originalTargets = IntervalList.fromFile(TARGETS); { // Apply padding final IntervalList padded = new IntervalList(originalTargets.getHeader()); final SAMSequenceDictionary dict = padded.getHeader().getSequenceDictionary(); for (final Interval i : originalTargets.getIntervals()) { padded.add(new Interval(i.getContig(), Math.max(i.getStart() - PADDING, 1), Math.min(i.getEnd() + PADDING, dict.getSequence(i.getContig()).getSequenceLength()), i.isNegativeStrand(), i.getName())); } log.info("Starting with " + padded.size() + " targets."); padded.uniqued(); log.info("After uniquing " + padded.size() + " targets remain."); if (MERGE_NEARBY_TARGETS) { final ListIterator iterator = padded.getIntervals().listIterator(); Interval previous = iterator.next(); targets = new IntervalList(padded.getHeader()); while (iterator.hasNext()) { final Interval next = iterator.next(); if (previous.getContig().equals(next.getContig()) && estimateBaits(previous.getStart(), previous.getEnd()) + estimateBaits(next.getStart(), next.getEnd()) >= estimateBaits(previous.getStart(), next.getEnd())) { previous = new Interval(previous.getContig(), previous.getStart(), Math.max(previous.getEnd(), next.getEnd()), previous.isNegativeStrand(), previous.getName()); } else { targets.add(previous); previous = next; } } if (previous != null) targets.add(previous); log.info("After collapsing nearby targets " + targets.size() + " targets remain."); } else { targets = padded; } } final ReferenceSequenceFileWalker referenceWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); // Check that the reference and the target list have matching headers SequenceUtil.assertSequenceDictionariesEqual(referenceWalker.getSequenceDictionary(), targets.getHeader().getSequenceDictionary()); // Design the baits! int discardedBaits = 0; final IntervalList baits = new IntervalList(targets.getHeader()); for (final Interval target : targets) { final int sequenceIndex = targets.getHeader().getSequenceIndex(target.getContig()); final ReferenceSequence reference = referenceWalker.get(sequenceIndex); for (final Bait bait : DESIGN_STRATEGY.design(this, target, reference)) { if (bait.length() != BAIT_SIZE) { throw new PicardException("Bait designed at wrong length: " + bait); } if (bait.getMaskedBaseCount() <= REPEAT_TOLERANCE) { baits.add(bait); for (final byte b : bait.getBases()) { final byte upper = StringUtil.toUpperCase(b); if (upper != 'A' && upper != 'C' && upper != 'G' && upper != 'T') { log.warn("Bait contains non-synthesizable bases: " + bait); } } } else { log.debug("Discarding bait: " + bait); discardedBaits++; } } } calculateStatistics(targets, baits); log.info("Designed and kept " + baits.size() + " baits, discarded " + discardedBaits); // Write out some files! originalTargets.write(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".targets.interval_list")); baits.write(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".baits.interval_list")); writeParametersFile(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".design_parameters.txt")); writeDesignFastaFile(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".design.fasta"), baits); if (POOL_SIZE > 0) writePoolFiles(OUTPUT_DIRECTORY, DESIGN_NAME, baits); return 0; } /** Calculates a few statistics about the bait design that can then be output. */ void calculateStatistics(final IntervalList targets, final IntervalList baits) { this.TARGET_TERRITORY = (int) targets.getUniqueBaseCount(); this.TARGET_COUNT = targets.size(); this.BAIT_TERRITORY = (int) baits.getUniqueBaseCount(); this.BAIT_COUNT = baits.size(); this.DESIGN_EFFICIENCY = this.TARGET_TERRITORY / (double) this.BAIT_TERRITORY; // Figure out the intersection between all targets and all baits final IntervalList tmp = new IntervalList(targets.getHeader()); final OverlapDetector detector = new OverlapDetector(0, 0); detector.addAll(baits.getIntervals(), baits.getIntervals()); for (final Interval target : targets) { final Collection overlaps = detector.getOverlaps(target); if (overlaps.isEmpty()) { this.ZERO_BAIT_TARGETS++; } else { for (final Interval i : overlaps) tmp.add(target.intersect(i)); } } tmp.uniqued(); this.BAIT_TARGET_TERRITORY_INTERSECTION = (int) tmp.getBaseCount(); } /** Method that writes out all the parameter values that were used in the design using reflection. */ void writeParametersFile(final File file) { try { final BufferedWriter out = IOUtil.openFileForBufferedWriting(file); for (final Field field : getClass().getDeclaredFields()) { if (Modifier.isPrivate(field.getModifiers())) continue; final String name = field.getName(); if (name.toUpperCase().equals(name) && !name.equals("USAGE")) { final Object value = field.get(this); if (value != null) { out.append(name); out.append("="); out.append(value.toString()); out.newLine(); } } } out.close(); } catch (Exception e) { throw new PicardException("Error writing out parameters file.", e); } } void writeDesignFastaFile(final File file, final IntervalList baits) { final BufferedWriter out = IOUtil.openFileForBufferedWriting(file); for (final Interval i : baits) { writeBaitFasta(out, i, false); } CloserUtil.close(out); } /** Writes a Bait out in fasta format to an output BufferedWriter. */ private void writeBaitFasta(final BufferedWriter out, final Interval i, final boolean rc) { try { final Bait bait = (Bait) i; out.append(">"); out.append(bait.getName()); out.newLine(); final String sequence = getBaitSequence(bait, rc); out.append(sequence); out.newLine(); } catch (IOException ioe) { throw new PicardException("Error writing out bait information.", ioe); } } /** Gets the bait sequence, with primers, as a String, RC'd as appropriate. */ private String getBaitSequence(final Bait bait, final boolean rc) { String sequence = (LEFT_PRIMER == null ? "" : LEFT_PRIMER) + StringUtil.bytesToString(bait.getBases()) + (RIGHT_PRIMER == null ? "" : RIGHT_PRIMER); if (rc) sequence = SequenceUtil.reverseComplement(sequence); return sequence; } /** * Writes out fasta files for each pool and also agilent format files if requested. * * @param dir the directory to output files into * @param basename the basename of each file * @param baits the set of baits to write out */ void writePoolFiles(final File dir, final String basename, final IntervalList baits) { final int copies; if (FILL_POOLS && baits.size() < POOL_SIZE) copies = (int) Math.floor(POOL_SIZE / (double) baits.size()); else copies = 1; int written = 0; int nextPool = 0; BufferedWriter out = null; BufferedWriter agilentOut = null; final String prefix = DESIGN_NAME.substring(0, Math.min(DESIGN_NAME.length(), 8)) + "_"; // prefix for 15 digit bait id final NumberFormat fmt = new DecimalFormat("000000"); try { for (int i = 0; i < copies; ++i) { final boolean rc = i % 2 == 1; int baitId = 1; for (final Interval interval : baits) { final Bait bait = (Bait) interval; if (written++ % POOL_SIZE == 0) { if (out != null) out.close(); if (agilentOut != null) agilentOut.close(); final String filename = basename + ".pool" + nextPool++ + ".design."; out = IOUtil.openFileForBufferedWriting(new File(dir, filename + "fasta")); if (OUTPUT_AGILENT_FILES) { agilentOut = IOUtil.openFileForBufferedWriting(new File(dir, filename + "txt")); } } writeBaitFasta(out, interval, rc); if (OUTPUT_AGILENT_FILES) { agilentOut.append(prefix).append(fmt.format(baitId++)); agilentOut.append("\t"); agilentOut.append(getBaitSequence(bait, rc).toUpperCase()); agilentOut.newLine(); } } } CloserUtil.close(out); CloserUtil.close(agilentOut); } catch (Exception e) { throw new PicardException("Error while writing pool files.", e); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy