picard.util.BaitDesigner Maven / Gradle / Ivy
package picard.util;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.None;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Pattern;
/**
* Designs baits for hybrid selection!
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = BaitDesigner.USAGE_SUMMARY + BaitDesigner.USAGE_DETAILS,
usageShort = BaitDesigner.USAGE_SUMMARY,
programGroup = None.class
)
public class BaitDesigner extends CommandLineProgram {
static final String USAGE_SUMMARY = "Designs oligonucleotide baits for hybrid selection reactions.";
static final String USAGE_DETAILS = "This tool is used to design custom bait sets for hybrid selection experiments. The following " +
"files are input into BaitDesigner: a (TARGET) interval list indicating the sequences of interest, e.g. exons with their " +
"respective coordinates, a reference sequence, and a unique identifier string (DESIGN_NAME).
" +
"The tool will output interval_list files of both bait and target sequences as well as the actual bait sequences in " +
"FastA format. At least two baits are output for each target sequence, with greater numbers for larger intervals. Although " +
"the default values for both bait size (120 bases) nd offsets (80 bases) are suitable for most applications, these values can " +
"be customized. Offsets represent the distance between sequential baits on a contiguous stretch of target DNA sequence.
"+
"The tool will also output a pooled set of 55,000 (default) oligonucleotides representing all of the baits redundantly. " +
"This redundancy achieves a uniform concentration of oligonucleotides for synthesis by a vendor as well as equal numbers" +
"of each bait to prevent bias during the hybrid selection reaction.
" +
"Usage example:
" +
"" +
"java -jar picard.jar BaitDesigner \\
" +
" TARGET=targets.interval_list \\
" +
" DESIGN_NAME=new_baits \\
" +
" R=reference_sequence.fasta " +
"
" +
"
";
/**
* Subclass of Interval for representing Baits, that caches the bait sequence.
*/
static class Bait extends Interval {
byte[] bases;
/** Pass through constructor. */
public Bait(final String sequence, final int start, final int end, final boolean negative, final String name) {
super(sequence, start, end, negative, name);
}
/** Method that takes in the reference sequence this bait is on and caches the bait's bases. */
public void addBases(final ReferenceSequence reference, final boolean useStrandInfo) {
final byte[] tmp = new byte[length()];
System.arraycopy(reference.getBases(), getStart() - 1, tmp, 0, length());
if (useStrandInfo && isNegativeStrand()) {
SequenceUtil.reverseComplement(tmp);
}
setBases(tmp);
}
public int getMaskedBaseCount() {
return BaitDesigner.getMaskedBaseCount(bases, 0, bases.length);
}
@Override
public String toString() {
return "Bait{" +
"name=" + getName() +
", bases=" + StringUtil.bytesToString(bases) +
'}';
}
public void setBases(final byte[] bases) { this.bases = bases;}
public byte[] getBases() { return bases; }
}
/**
* Set of possible design strategies for bait design.
*/
public enum DesignStrategy {
/** Implementation that "constrains" baits to be within the target region when possible. */
CenteredConstrained {
List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) {
final List baits = new LinkedList();
final int baitSize = designer.BAIT_SIZE;
final int baitOffset = designer.BAIT_OFFSET;
// Treat targets less than a bait length differently
if (target.length() <= baitSize) {
final int midpoint = target.getStart() + (target.length() / 2);
final int baitStart = midpoint - (baitSize / 2);
final Bait bait = new Bait(target.getContig(),
baitStart,
CoordMath.getEnd(baitStart, baitSize),
target.isNegativeStrand(),
designer.makeBaitName(target.getName(), 1, 1));
bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND);
baits.add(bait);
} else {
// Work out how many baits we need and how to space them
final int baitCount = 1 + (int) Math.ceil((target.length() - baitSize) / (double) baitOffset);
final int firstBaitStart = target.getStart();
final int lastBaitStart = CoordMath.getStart(target.getEnd(), baitSize);
final double actualShift = (lastBaitStart - firstBaitStart) / (double) (baitCount - 1);
// And then design them
int baitIndex = 1;
int start = firstBaitStart;
while (start <= lastBaitStart) {
final int end = CoordMath.getEnd(start, baitSize);
final Bait bait = new Bait(target.getContig(),
start,
end,
target.isNegativeStrand(),
designer.makeBaitName(target.getName(), baitIndex, baitCount));
bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND);
baits.add(bait);
// Recalculate from actualShift to avoid compounding rounding errors
start = firstBaitStart + (int) Math.round(actualShift * baitIndex);
++baitIndex;
}
}
return baits;
}
},
/**
* Design that places baits at fixed offsets over targets, allowing them to hang off the ends
* as dictated by the target size and offset.
*/
FixedOffset {
List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) {
final List baits = new LinkedList();
final int baitSize = designer.BAIT_SIZE;
final int baitOffset = designer.BAIT_OFFSET;
final int minTargetSize = baitSize + (baitOffset * (designer.MINIMUM_BAITS_PER_TARGET - 1));
// Redefine the target to be the size of a bait, or the size that a number of baits
// tiled tiles across the target at the fixed offset
final Interval t2;
if (target.length() < minTargetSize) {
final int addon = minTargetSize - target.length();
final int left = addon / 2;
final int right = addon - left;
t2 = new Interval(target.getContig(),
Math.max(target.getStart() - left, 1),
Math.min(target.getEnd() + right, reference.length()),
target.isNegativeStrand(),
target.getName());
} else {
t2 = target;
}
// Work out how many baits we need and how to space them
final int baitCount = 1 + (int) Math.ceil((t2.length() - baitSize) / (double) baitOffset);
final int baitedBases = baitSize + (baitOffset * (baitCount - 1));
final int firstBaitStart = Math.max(t2.getStart() - ((baitedBases - t2.length()) / 2), 1);
// And then design them
final byte[] bases = reference.getBases();
final int MAX_MASKED = designer.REPEAT_TOLERANCE;
for (int i = 1; i <= baitCount; ++i) {
int start = firstBaitStart + (baitOffset * (i - 1));
int end = CoordMath.getEnd(start, baitSize);
if (end > reference.length()) break;
// If there are too many soft masked bases try shifting it around
if (getMaskedBaseCount(bases, start - 1, end) > MAX_MASKED) {
final int maxMove = baitOffset * 3 / 4;
for (int move = 1; move <= maxMove; move++) {
// Move it "backwards?"
if (start - move >= 1 && getMaskedBaseCount(bases, start - move - 1, end - move) <= MAX_MASKED) {
start = start - move;
end = end - move;
break;
}
// Move it "forwards"?
if (end + move <= reference.length() && getMaskedBaseCount(bases, start + move - 1, end + move) <= MAX_MASKED) {
start = start + move;
end = end + move;
break;
}
}
}
final Bait bait = new Bait(t2.getContig(),
start,
end,
t2.isNegativeStrand(),
designer.makeBaitName(t2.getName(), i, baitCount));
bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND);
baits.add(bait);
}
return baits;
}
},
/**
* Ultra simple bait design algorithm that just lays down baits starting at the target start position
* until either the bait start runs off the end of the target or the bait would run off the sequence
*/
Simple {
@Override
List design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) {
final List baits = new LinkedList();
final int baitSize = designer.BAIT_SIZE;
final int baitOffset = designer.BAIT_OFFSET;
final int lastPossibleBaitStart = Math.min(target.getEnd(), reference.length() - baitSize);
final int baitCount = 1 + (int) Math.floor((lastPossibleBaitStart - target.getStart()) / (double) baitOffset);
int i = 0;
for (int start = target.getStart(); start < lastPossibleBaitStart; start += baitOffset) {
final Bait bait = new Bait(target.getContig(),
start,
CoordMath.getEnd(start, baitSize),
target.isNegativeStrand(),
designer.makeBaitName(target.getName(), ++i, baitCount));
bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND);
baits.add(bait);
}
return baits;
}
};
/** Design method that each Design Strategy must implement. */
abstract List design(BaitDesigner designer, Interval target, ReferenceSequence reference);
}
///////////////////////////////////////////////////////////////////////////
// Options for the Bait Designer
///////////////////////////////////////////////////////////////////////////
@Option(shortName = "T", doc = "The file with design parameters and targets")
public File TARGETS;
@Option(doc = "The name of the bait design")
public String DESIGN_NAME;
@Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "The reference sequence fasta file")
public File REFERENCE_SEQUENCE;
@Option(doc = "The left amplification primer to prepend to all baits for synthesis")
public String LEFT_PRIMER = "ATCGCACCAGCGTGT";
@Option(doc = "The right amplification primer to prepend to all baits for synthesis")
public String RIGHT_PRIMER = "CACTGCGGCTCCTCA";
@Option(doc = "The design strategy to use to layout baits across each target")
public DesignStrategy DESIGN_STRATEGY = DesignStrategy.FixedOffset;
@Option(doc = "The length of each individual bait to design")
public int BAIT_SIZE = 120;
@Option(doc = "The minimum number of baits to design per target.")
public int MINIMUM_BAITS_PER_TARGET = 2;
@Option(doc = "The desired offset between the start of one bait and the start of another bait for the same target.")
public int BAIT_OFFSET = 80;
@Option(doc = "Pad the input targets by this amount when designing baits. Padding is applied on both sides in this amount.")
public int PADDING = 0;
@Option(doc = "Baits that have more than REPEAT_TOLERANCE soft or hard masked bases will not be allowed")
public int REPEAT_TOLERANCE = 50;
@Option(doc = "The size of pools or arrays for synthesis. If no pool files are desired, can be set to 0.")
public int POOL_SIZE = 55000;
@Option(doc = "If true, fill up the pools with alternating fwd and rc copies of all baits. Equal copies of " +
"all baits will always be maintained")
public boolean FILL_POOLS = true;
@Option(doc = "If true design baits on the strand of the target feature, if false always design on the + strand of " +
"the genome.")
public boolean DESIGN_ON_TARGET_STRAND = false;
@Option(doc = "If true merge targets that are 'close enough' that designing against a merged target would be more efficient.")
public boolean MERGE_NEARBY_TARGETS = true;
@Option(doc = "If true also output .design.txt files per pool with one line per bait sequence")
public boolean OUTPUT_AGILENT_FILES = true;
@Option(shortName = "O", optional = true,
doc = "The output directory. If not provided then the DESIGN_NAME will be used as the output directory")
public File OUTPUT_DIRECTORY;
// "Output" members that will also get picked up by writeParametersFile()
int TARGET_TERRITORY;
int TARGET_COUNT;
int BAIT_TERRITORY;
int BAIT_COUNT;
int BAIT_TARGET_TERRITORY_INTERSECTION;
int ZERO_BAIT_TARGETS;
double DESIGN_EFFICIENCY;
// Utility objects
private static final Log log = Log.getInstance(BaitDesigner.class);
private final NumberFormat fmt = NumberFormat.getIntegerInstance();
/** Takes a target name and a bait index and creates a uniform bait name. */
String makeBaitName(final String targetName, final int baitIndex, final int totalBaits) {
final String total = fmt.format(totalBaits);
String bait = fmt.format(baitIndex);
// Pad out the bait number to match the longest one for this target
while (bait.length() < total.length()) bait = "0" + bait;
return targetName + "_bait#" + bait;
}
/** Returns the total of soft or hard masked bases in the interval of bases. */
public static int getMaskedBaseCount(final byte[] bases, final int from, final int until) {
int count = 0;
for (int i = from; i < until; i++) {
final byte b = bases[i];
if (b != 'A' && b != 'C' && b != 'G' && b != 'T') ++count;
}
return count;
}
/** Stock main method. */
public static void main(final String[] args) {
new BaitDesigner().instanceMainWithExit(args);
}
@Override
protected String[] customCommandLineValidation() {
final List errors = new ArrayList();
final Pattern p = Pattern.compile("^[ACGTacgt]*$");
if (LEFT_PRIMER != null && !p.matcher(LEFT_PRIMER).matches()) {
errors.add("Left primer " + LEFT_PRIMER + " is not a valid primer sequence.");
}
if (RIGHT_PRIMER != null && !p.matcher(RIGHT_PRIMER).matches()) {
errors.add("Right primer " + RIGHT_PRIMER + " is not a valid primer sequence.");
}
if (!errors.isEmpty()) return errors.toArray(new String[errors.size()]);
else return null;
}
int estimateBaits(final int start, final int end) {
final int length = end - start + 1;
return Math.max(MINIMUM_BAITS_PER_TARGET, (int) (Math.ceil(length - BAIT_SIZE) / (double) BAIT_OFFSET) + 1);
}
/**
* Main method that coordinates the checking of inputs, designing of baits and then
* the writing out of all necessary files.
*
* @return
*/
@Override
protected int doWork() {
// Input parameter munging and checking
if (OUTPUT_DIRECTORY == null) OUTPUT_DIRECTORY = new File(DESIGN_NAME);
IOUtil.assertFileIsReadable(TARGETS);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
if (!OUTPUT_DIRECTORY.exists()) {
OUTPUT_DIRECTORY.mkdirs();
}
IOUtil.assertDirectoryIsWritable(OUTPUT_DIRECTORY);
// Load up the targets and the reference
final IntervalList targets;
final IntervalList originalTargets = IntervalList.fromFile(TARGETS);
{
// Apply padding
final IntervalList padded = new IntervalList(originalTargets.getHeader());
final SAMSequenceDictionary dict = padded.getHeader().getSequenceDictionary();
for (final Interval i : originalTargets.getIntervals()) {
padded.add(new Interval(i.getContig(),
Math.max(i.getStart() - PADDING, 1),
Math.min(i.getEnd() + PADDING, dict.getSequence(i.getContig()).getSequenceLength()),
i.isNegativeStrand(),
i.getName()));
}
log.info("Starting with " + padded.size() + " targets.");
padded.uniqued();
log.info("After uniquing " + padded.size() + " targets remain.");
if (MERGE_NEARBY_TARGETS) {
final ListIterator iterator = padded.getIntervals().listIterator();
Interval previous = iterator.next();
targets = new IntervalList(padded.getHeader());
while (iterator.hasNext()) {
final Interval next = iterator.next();
if (previous.getContig().equals(next.getContig()) &&
estimateBaits(previous.getStart(), previous.getEnd()) + estimateBaits(next.getStart(), next.getEnd()) >=
estimateBaits(previous.getStart(), next.getEnd())) {
previous = new Interval(previous.getContig(),
previous.getStart(),
Math.max(previous.getEnd(), next.getEnd()),
previous.isNegativeStrand(),
previous.getName());
} else {
targets.add(previous);
previous = next;
}
}
if (previous != null) targets.add(previous);
log.info("After collapsing nearby targets " + targets.size() + " targets remain.");
} else {
targets = padded;
}
}
final ReferenceSequenceFileWalker referenceWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
// Check that the reference and the target list have matching headers
SequenceUtil.assertSequenceDictionariesEqual(referenceWalker.getSequenceDictionary(),
targets.getHeader().getSequenceDictionary());
// Design the baits!
int discardedBaits = 0;
final IntervalList baits = new IntervalList(targets.getHeader());
for (final Interval target : targets) {
final int sequenceIndex = targets.getHeader().getSequenceIndex(target.getContig());
final ReferenceSequence reference = referenceWalker.get(sequenceIndex);
for (final Bait bait : DESIGN_STRATEGY.design(this, target, reference)) {
if (bait.length() != BAIT_SIZE) {
throw new PicardException("Bait designed at wrong length: " + bait);
}
if (bait.getMaskedBaseCount() <= REPEAT_TOLERANCE) {
baits.add(bait);
for (final byte b : bait.getBases()) {
final byte upper = StringUtil.toUpperCase(b);
if (upper != 'A' && upper != 'C' && upper != 'G' && upper != 'T') {
log.warn("Bait contains non-synthesizable bases: " + bait);
}
}
} else {
log.debug("Discarding bait: " + bait);
discardedBaits++;
}
}
}
calculateStatistics(targets, baits);
log.info("Designed and kept " + baits.size() + " baits, discarded " + discardedBaits);
// Write out some files!
originalTargets.write(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".targets.interval_list"));
baits.write(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".baits.interval_list"));
writeParametersFile(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".design_parameters.txt"));
writeDesignFastaFile(new File(OUTPUT_DIRECTORY, DESIGN_NAME + ".design.fasta"), baits);
if (POOL_SIZE > 0) writePoolFiles(OUTPUT_DIRECTORY, DESIGN_NAME, baits);
return 0;
}
/** Calculates a few statistics about the bait design that can then be output. */
void calculateStatistics(final IntervalList targets, final IntervalList baits) {
this.TARGET_TERRITORY = (int) targets.getUniqueBaseCount();
this.TARGET_COUNT = targets.size();
this.BAIT_TERRITORY = (int) baits.getUniqueBaseCount();
this.BAIT_COUNT = baits.size();
this.DESIGN_EFFICIENCY = this.TARGET_TERRITORY / (double) this.BAIT_TERRITORY;
// Figure out the intersection between all targets and all baits
final IntervalList tmp = new IntervalList(targets.getHeader());
final OverlapDetector detector = new OverlapDetector(0, 0);
detector.addAll(baits.getIntervals(), baits.getIntervals());
for (final Interval target : targets) {
final Collection overlaps = detector.getOverlaps(target);
if (overlaps.isEmpty()) {
this.ZERO_BAIT_TARGETS++;
} else {
for (final Interval i : overlaps) tmp.add(target.intersect(i));
}
}
tmp.uniqued();
this.BAIT_TARGET_TERRITORY_INTERSECTION = (int) tmp.getBaseCount();
}
/** Method that writes out all the parameter values that were used in the design using reflection. */
void writeParametersFile(final File file) {
try {
final BufferedWriter out = IOUtil.openFileForBufferedWriting(file);
for (final Field field : getClass().getDeclaredFields()) {
if (Modifier.isPrivate(field.getModifiers())) continue;
final String name = field.getName();
if (name.toUpperCase().equals(name) && !name.equals("USAGE")) {
final Object value = field.get(this);
if (value != null) {
out.append(name);
out.append("=");
out.append(value.toString());
out.newLine();
}
}
}
out.close();
} catch (Exception e) {
throw new PicardException("Error writing out parameters file.", e);
}
}
void writeDesignFastaFile(final File file, final IntervalList baits) {
final BufferedWriter out = IOUtil.openFileForBufferedWriting(file);
for (final Interval i : baits) {
writeBaitFasta(out, i, false);
}
CloserUtil.close(out);
}
/** Writes a Bait out in fasta format to an output BufferedWriter. */
private void writeBaitFasta(final BufferedWriter out, final Interval i, final boolean rc) {
try {
final Bait bait = (Bait) i;
out.append(">");
out.append(bait.getName());
out.newLine();
final String sequence = getBaitSequence(bait, rc);
out.append(sequence);
out.newLine();
} catch (IOException ioe) {
throw new PicardException("Error writing out bait information.", ioe);
}
}
/** Gets the bait sequence, with primers, as a String, RC'd as appropriate. */
private String getBaitSequence(final Bait bait, final boolean rc) {
String sequence = (LEFT_PRIMER == null ? "" : LEFT_PRIMER) +
StringUtil.bytesToString(bait.getBases()) +
(RIGHT_PRIMER == null ? "" : RIGHT_PRIMER);
if (rc) sequence = SequenceUtil.reverseComplement(sequence);
return sequence;
}
/**
* Writes out fasta files for each pool and also agilent format files if requested.
*
* @param dir the directory to output files into
* @param basename the basename of each file
* @param baits the set of baits to write out
*/
void writePoolFiles(final File dir, final String basename, final IntervalList baits) {
final int copies;
if (FILL_POOLS && baits.size() < POOL_SIZE) copies = (int) Math.floor(POOL_SIZE / (double) baits.size());
else copies = 1;
int written = 0;
int nextPool = 0;
BufferedWriter out = null;
BufferedWriter agilentOut = null;
final String prefix = DESIGN_NAME.substring(0, Math.min(DESIGN_NAME.length(), 8)) + "_"; // prefix for 15 digit bait id
final NumberFormat fmt = new DecimalFormat("000000");
try {
for (int i = 0; i < copies; ++i) {
final boolean rc = i % 2 == 1;
int baitId = 1;
for (final Interval interval : baits) {
final Bait bait = (Bait) interval;
if (written++ % POOL_SIZE == 0) {
if (out != null) out.close();
if (agilentOut != null) agilentOut.close();
final String filename = basename + ".pool" + nextPool++ + ".design.";
out = IOUtil.openFileForBufferedWriting(new File(dir, filename + "fasta"));
if (OUTPUT_AGILENT_FILES) {
agilentOut = IOUtil.openFileForBufferedWriting(new File(dir, filename + "txt"));
}
}
writeBaitFasta(out, interval, rc);
if (OUTPUT_AGILENT_FILES) {
agilentOut.append(prefix).append(fmt.format(baitId++));
agilentOut.append("\t");
agilentOut.append(getBaitSequence(bait, rc).toUpperCase());
agilentOut.newLine();
}
}
}
CloserUtil.close(out);
CloserUtil.close(agilentOut);
} catch (Exception e) {
throw new PicardException("Error while writing pool files.", e);
}
}
}