org.nmdp.ngs.tools.EvaluateScaffolds Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ngs-tools Show documentation
Show all versions of ngs-tools Show documentation
Next generation sequencing (NGS/HTS) command line tools.
/*
ngs-tools Next generation sequencing (NGS/HTS) command line tools.
Copyright (c) 2014-2015 National Marrow Donor Program (NMDP)
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
> http://www.gnu.org/licenses/lgpl.html
*/
package org.nmdp.ngs.tools;
import static com.google.common.base.Preconditions.checkNotNull;
import static org.dishevelled.compress.Readers.reader;
import static org.dishevelled.compress.Writers.writer;
import static org.nmdp.ngs.align.Blastn.blastn;
import java.io.BufferedReader;
import java.io.File;
import java.io.PrintWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.dishevelled.commandline.ArgumentList;
import org.dishevelled.commandline.CommandLine;
import org.dishevelled.commandline.CommandLineParseException;
import org.dishevelled.commandline.CommandLineParser;
import org.dishevelled.commandline.Switch;
import org.dishevelled.commandline.Usage;
import org.dishevelled.commandline.argument.FileArgument;
import org.nmdp.ngs.align.HighScoringPair;
/**
* Evaluate assembly scaffolds against a reference sequence.
*/
@SuppressWarnings("deprecation")
public final class EvaluateScaffolds implements Callable {
private final File referenceFastaFile;
private final File scaffoldsFastaFile;
private final File evalFile;
private static final String USAGE = "ngs-evaluate-scaffolds -r reference.fa.gz -s scaffolds.fa.gz";
/**
* Evaluate assembly scaffolds against a reference sequence.
*
* @param referenceFastaFile input reference FASTA file, must not be null
* @param scaffoldsFastaFile input scaffolds FASTA file, must not be null
* @param evalFile output eval file, if any
*/
public EvaluateScaffolds(final File referenceFastaFile, final File scaffoldsFastaFile, final File evalFile) {
checkNotNull(referenceFastaFile);
checkNotNull(scaffoldsFastaFile);
this.referenceFastaFile = referenceFastaFile;
this.scaffoldsFastaFile = scaffoldsFastaFile;
this.evalFile = evalFile;
}
@Override
public Integer call() throws Exception {
PrintWriter writer = null;
try {
writer = writer(evalFile);
Sequence reference = readReference();
List scaffolds = readScaffolds();
writer.println("#reference length = " + reference.length());
writer.println("#scaffold count = " + scaffolds.size());
writer.println("#scaffold lengths = " + dumpLengths(scaffolds));
RangeSet ranges = TreeRangeSet.create();
for (HighScoringPair hsp : blastn(referenceFastaFile, scaffoldsFastaFile)) {
if (reference.getName().equals(hsp.target())) {
writer.println("#" + hsp.toString());
if (hsp.targetStart() <= hsp.targetEnd()) { // strands match
ranges.add(Range.closed(hsp.targetStart(), hsp.targetEnd()));
}
else {
ranges.add(Range.closed(hsp.targetEnd(), hsp.targetStart()));
}
}
}
writer.println("#coalesced intervals = " + ranges);
long breadthOfCoverage = 0;
for (Range range : ranges.asRanges()) {
breadthOfCoverage += ContiguousSet.create(range, DiscreteDomain.longs()).size();
}
double normalizedBreadthOfCoverage = (double) breadthOfCoverage / (double) reference.length();
writer.println("#breadth-of-coverage = " + breadthOfCoverage);
writer.println("#normalized breadth-of-coverage = " + normalizedBreadthOfCoverage);
StringBuilder sb = new StringBuilder();
sb.append(referenceFastaFile.getName());
sb.append("\t");
sb.append(scaffoldsFastaFile.getName());
sb.append("\t");
sb.append(reference.length());
sb.append("\t");
sb.append(scaffolds.size());
sb.append("\t");
sb.append(ranges.asRanges().size());
sb.append("\t");
sb.append(breadthOfCoverage);
sb.append("\t");
sb.append(normalizedBreadthOfCoverage);
sb.append("\t");
writer.println(sb.toString());
return 0;
}
finally {
try {
writer.close();
}
catch (Exception e) {
// ignore
}
}
}
private Sequence readReference() throws IOException, BioException {
Sequence sequence = null;
BufferedReader reader = null;
try {
reader = reader(referenceFastaFile);
SequenceIterator sequences = SeqIOTools.readFastaDNA(reader);
if (!sequences.hasNext()) {
throw new IOException("reference FASTA file was empty");
}
sequence = sequences.nextSequence();
if (sequences.hasNext()) {
throw new IOException("reference FASTA file contains more than one sequence");
}
}
finally {
try {
reader.close();
}
catch (Exception e) {
// ignore
}
}
return sequence;
}
private List readScaffolds() throws IOException, BioException {
BufferedReader reader = null;
List scaffolds = Lists.newLinkedList();
try {
reader = reader(scaffoldsFastaFile);
for (SequenceIterator sequences = SeqIOTools.readFastaDNA(reader); sequences.hasNext(); ) {
Sequence sequence = sequences.nextSequence();
scaffolds.add(sequence);
}
}
finally {
try {
reader.close();
}
catch (Exception e) {
// ignore
}
}
return scaffolds;
}
private static String dumpLengths(final List scaffolds) {
StringBuilder sb = new StringBuilder();
Iterator iterator = scaffolds.iterator();
sb.append(iterator.next().length());
while (iterator.hasNext()) {
sb.append(", ");
sb.append(iterator.next().length());
}
return sb.toString();
}
/**
* Main.
*
* @param args command line args
*/
public static void main(final String[] args) {
Switch about = new Switch("a", "about", "display about message");
Switch help = new Switch("h", "help", "display help message");
FileArgument referenceFastaFile = new FileArgument("r", "reference", "input reference FASTA file", true);
FileArgument scaffoldsFastaFile = new FileArgument("s", "scaffolds", "input scaffolds FASTA file", true);
FileArgument evalFile = new FileArgument("e", "eval", "output eval file, default stdout", false);
ArgumentList arguments = new ArgumentList(about, help, referenceFastaFile, scaffoldsFastaFile, evalFile);
CommandLine commandLine = new CommandLine(args);
EvaluateScaffolds evaluateScaffolds = null;
try {
CommandLineParser.parse(commandLine, arguments);
if (about.wasFound()) {
About.about(System.out);
System.exit(0);
}
if (help.wasFound()) {
Usage.usage(USAGE, null, commandLine, arguments, System.out);
System.exit(0);
}
evaluateScaffolds = new EvaluateScaffolds(referenceFastaFile.getValue(), scaffoldsFastaFile.getValue(), evalFile.getValue());
}
catch (CommandLineParseException e) {
if (about.wasFound()) {
About.about(System.out);
System.exit(0);
}
if (help.wasFound()) {
Usage.usage(USAGE, null, commandLine, arguments, System.out);
System.exit(0);
}
Usage.usage(USAGE, e, commandLine, arguments, System.err);
System.exit(-1);
}
try {
System.exit(evaluateScaffolds.call());
}
catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
}