// Copyright (c) 2011 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2011-06-14 14:52:20
package org.seqdoop.hadoop_bam.cli;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.CodeSource;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import htsjdk.samtools.*;
import htsjdk.samtools.util.BlockCompressedStreamConstants;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

import org.seqdoop.hadoop_bam.SAMFormat;
import org.seqdoop.hadoop_bam.util.SAMOutputPreparer;
import org.seqdoop.hadoop_bam.util.Timer;

import hbparquet.hadoop.util.ContextUtil;

public final class Utils {
    public static void printWrapped(PrintStream out, String str) {
        printWrapped(out, str, 0);
    }

    public static void printWrapped(PrintStream out, String str, int indent) {
        printWrapped(out, str, indent, 80);
    }

    public static void printWrapped(
        PrintStream out, String str, int indent, int wrapAt)
    {
        int pos = indent;

        final Scanner words = new Scanner(str);

        // Print the first word out here to avoid a spurious line break if it
        // would wrap.
        if (words.hasNext()) {
            final String word = words.next();
            out.print(word);
            pos += word.length();
        }

        boolean addSpace = true;
        while (words.hasNext()) {
            final String word = words.next();
            final int wend = words.match().end();

            pos += word.length();
            if (addSpace)
                ++pos;

            if (pos < wrapAt) {
                if (addSpace)
                    out.print(' ');
                out.print(word);
            } else {
                pos = indent + word.length();
                out.print('\n');
                for (int i = indent; i-- > 0;)
                    out.print(' ');
                out.print(word);
            }

            if (wend < str.length() && str.charAt(wend) == '\n') {
                pos = indent;
                addSpace = false;
                int i = wend;
                do
                    out.print('\n');
                while (++i < str.length() && str.charAt(i) == '\n');
                for (i = indent; i-- > 0;)
                    out.print(' ');
            } else
                addSpace = true;
        }
        out.print('\n');
    }
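
    // Illustrative usage sketch (not part of the class; the help text below
    // is made up): wrap a long option description at the default 80 columns
    // with a four-column hanging indent.
    //
    //     Utils.printWrapped(System.out,
    //         "-v, --verbose  print a progress report for every part file "
    //         + "that gets merged into the final output",
    //         4);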

    private static String argv0 = null;
    private static Class<?> argv0Class = null;

    // For printing something intelligent in "Usage: argv0 <args>" messages.
    public static String getArgv0() {
        if (argv0 != null)
            return argv0;
        if (argv0Class == null)
            return null;

        final CodeSource cs =
            argv0Class.getProtectionDomain().getCodeSource();
        if (cs == null)
            return null;

        final String path = cs.getLocation().getPath();
        if (path.endsWith("/")) {
            // Typically (always?) a .class file loaded from the directory in
            // path.
            argv0 = argv0Class.getSimpleName();
        } else {
            // Typically (always?) a .jar file.
            argv0 = new File(path).getName();
        }
        return argv0;
    }

    public static void setArgv0Class(Class<?> cl) {
        argv0Class = cl;
        argv0 = null;
    }
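
    // Illustrative only (MyFrontend is a hypothetical entry point): a main
    // class registers itself once, after which getArgv0() yields either the
    // class's simple name or the enclosing jar's filename for usage messages.
    //
    //     Utils.setArgv0Class(MyFrontend.class);
    //     System.err.println(
    //         "Usage: " + Utils.getArgv0() + " <command> [options]");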

    public static void configureSampling(
        Path workDir, String outName, Configuration conf)
        throws IOException
    {
        final Path partition =
            workDir.getFileSystem(conf).makeQualified(
                new Path(workDir, "_partitioning" + outName));

        TotalOrderPartitioner.setPartitionFile(conf, partition);
        try {
            final URI partitionURI = new URI(
                partition.toString() + "#" + partition.getName());

            if (partitionURI.getScheme().equals("file"))
                return;

            DistributedCache.addCacheFile(partitionURI, conf);
            DistributedCache.createSymlink(conf);
        } catch (URISyntaxException e) { throw new RuntimeException(e); }
    }
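
    // A minimal sketch of the intended call site (the paths and the
    // partition-file writing step are assumptions, not part of this class):
    // sampling is configured before the TotalOrderPartitioner's partition
    // file is written and the job is submitted.
    //
    //     Utils.configureSampling(new Path("/tmp/work"), "out.bam", conf);
    //     // ... write the partition file, e.g. via InputSampler, then
    //     // submit the job ...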

    public static final String WORK_FILENAME_PROPERTY =
        "hadoopbam.work.filename";

    /** Returns a name that mergeInto() will recognize as a file to be merged.
     *
     * The filename is the value of WORK_FILENAME_PROPERTY surrounded by
     * basePrefix and basePostfix, followed by the part number and with the
     * given extension.
     */
    public static Path getMergeableWorkFile(
        Path directory, String basePrefix, String basePostfix,
        TaskAttemptContext ctx, String extension)
    {
        return new Path(
            directory,
            basePrefix
            + ContextUtil.getConfiguration(ctx).get(WORK_FILENAME_PROPERTY)
            + basePostfix
            + "-"
            + String.format("%06d", ctx.getTaskAttemptID().getTaskID().getId())
            + (extension.isEmpty() ? extension : "." + extension));
    }
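
    // For example (illustrative values): with WORK_FILENAME_PROPERTY set to
    // "out", empty basePrefix and basePostfix, extension "bam", and task ID
    // 3, the returned path ends in "out-000003.bam".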

    /** Merges the files in the given directory that have names given by
     * getMergeableWorkFile() into out.
     *
     * Outputs progress reports if commandName is non-null.
     */
    public static void mergeInto(
        OutputStream out,
        Path directory, String basePrefix, String basePostfix,
        Configuration conf, String commandName)
        throws IOException
    {
        final FileSystem fs = directory.getFileSystem(conf);

        final FileStatus[] parts = fs.globStatus(new Path(
            directory,
            basePrefix + conf.get(WORK_FILENAME_PROPERTY) + basePostfix
            + "-[0-9][0-9][0-9][0-9][0-9][0-9]*"));

        int i = 0;
        Timer t = new Timer();
        for (final FileStatus part : parts) {
            if (commandName != null) {
                System.out.printf("%s :: Merging part %d (size %d)...",
                    commandName, ++i, part.getLen());
                System.out.flush();
                t.start();
            }

            final InputStream in = fs.open(part.getPath());
            IOUtils.copyBytes(in, out, conf, false);
            in.close();

            if (commandName != null)
                System.out.printf(" done in %d.%03d s.\n", t.stopS(), t.fms());
        }
        for (final FileStatus part : parts)
            fs.delete(part.getPath(), false);
    }
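
    // Illustrative only (outPath and the work directory are made up):
    // concatenate the numbered part files into a single output stream;
    // passing a null commandName suppresses the progress reports.
    //
    //     final OutputStream merged =
    //         outPath.getFileSystem(conf).create(outPath);
    //     Utils.mergeInto(merged, new Path("/tmp/work"), "", "", conf, null);
    //     merged.close();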

    private static final String
        HEADERMERGER_SORTORDER_PROP = "hadoopbam.headermerger.sortorder";

    public static void setHeaderMergerSortOrder(
        Configuration conf, SAMFileHeader.SortOrder order)
    {
        conf.set(HEADERMERGER_SORTORDER_PROP, order.name());
    }

    public static final String
        HEADERMERGER_INPUTS_PROPERTY = "hadoopbam.headermerger.inputs";

    private static SamFileHeaderMerger headerMerger = null;

    /** Computes the merger of the SAM headers in the files listed in
     * HEADERMERGER_INPUTS_PROPERTY. The sort order of the result is set
     * according to the last call to setHeaderMergerSortOrder, or otherwise
     * to "unsorted".
     *
     * The result is cached locally to prevent it from being recomputed too
     * often.
     */
    public static SamFileHeaderMerger getSAMHeaderMerger(Configuration conf)
        throws IOException
    {
        // TODO: it would be preferable to cache this beforehand instead of
        // having every task read the header block of every input file. But
        // that would be trickier, given that SamFileHeaderMerger isn't
        // trivially serializable.

        // Save it in a static field, though, in case that helps anything.
        if (headerMerger != null)
            return headerMerger;

        final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
        for (final String in : conf.getStrings(HEADERMERGER_INPUTS_PROPERTY)) {
            final Path p = new Path(in);

            final SamReader r = SamReaderFactory.makeDefault().open(
                SamInputResource.of(p.getFileSystem(conf).open(p)));
            headers.add(r.getFileHeader());
            r.close();
        }

        final String orderStr = conf.get(HEADERMERGER_SORTORDER_PROP);
        final SAMFileHeader.SortOrder order =
            orderStr == null ? SAMFileHeader.SortOrder.unsorted
                             : SAMFileHeader.SortOrder.valueOf(orderStr);

        return headerMerger = new SamFileHeaderMerger(order, headers, true);
    }
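
    // A minimal sketch of the driver-side setup this method expects (the
    // input paths are made up): list the inputs and pick a sort order before
    // any task calls getSAMHeaderMerger().
    //
    //     conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY,
    //                     "hdfs:///data/a.bam", "hdfs:///data/b.bam");
    //     Utils.setHeaderMergerSortOrder(
    //         conf, SAMFileHeader.SortOrder.coordinate);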

    /** Changes the given SAMRecord as appropriate for being placed in a file
     * whose header is getSAMHeaderMerger(conf).getMergedHeader().
     */
    public static void correctSAMRecordForMerging(
        SAMRecord r, Configuration conf)
        throws IOException
    {
        if (headerMerger == null)
            getSAMHeaderMerger(conf);

        final SAMFileHeader h = r.getHeader();

        // Correct the reference indices, and thus the key, if necessary.
        if (headerMerger.hasMergedSequenceDictionary()) {
            final int ri = headerMerger.getMergedSequenceIndex(
                h, r.getReferenceIndex());

            r.setReferenceIndex(ri);
            if (r.getReadPairedFlag())
                r.setMateReferenceIndex(headerMerger.getMergedSequenceIndex(
                    h, r.getMateReferenceIndex()));
        }

        // Correct the program group if necessary.
        if (headerMerger.hasProgramGroupCollisions()) {
            final String pg = (String)r.getAttribute(
                ReservedTagConstants.PROGRAM_GROUP_ID);
            if (pg != null)
                r.setAttribute(
                    ReservedTagConstants.PROGRAM_GROUP_ID,
                    headerMerger.getProgramGroupId(h, pg));
        }

        // Correct the read group if necessary.
        if (headerMerger.hasReadGroupCollisions()) {
            final String rg = (String)r.getAttribute(
                ReservedTagConstants.READ_GROUP_ID);
            if (rg != null)
                r.setAttribute(
                    ReservedTagConstants.READ_GROUP_ID,
                    headerMerger.getReadGroupId(h, rg));
        }
    }
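
    // Illustrative only: in a task that contributes records to a merged
    // output, each record is rewritten against the merged header before
    // being emitted (the emit step is left abstract here).
    //
    //     Utils.correctSAMRecordForMerging(record, conf);
    //     // ... emit the corrected record ...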

    /** Merges the files in the given directory that have names given by
     * getMergeableWorkFile() into out in the given SAMFormat, using
     * getSAMHeaderMerger().getMergedHeader() as the header.
     *
     * Outputs progress reports if commandName is non-null.
     */
    public static void mergeSAMInto(
        Path out, Path directory, String basePrefix, String basePostfix,
        SAMFormat format, Configuration conf, String commandName)
        throws IOException
    {
        Timer t = new Timer();
        if (commandName != null) {
            System.out.printf("%s :: Merging output...\n", commandName);
            t.start();
        }

        final OutputStream outs = out.getFileSystem(conf).create(out);

        // First, place the SAM or BAM header.
        //
        // Don't use the returned stream, because we're concatenating directly
        // and don't want to apply another layer of compression to BAM.
        new SAMOutputPreparer().prepareForRecords(
            outs, format, getSAMHeaderMerger(conf).getMergedHeader());

        // Then, the actual SAM or BAM contents.
        mergeInto(outs, directory, basePrefix, basePostfix, conf, commandName);

        // And if BAM, the BGZF terminator.
        if (format == SAMFormat.BAM)
            outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

        outs.close();

        if (commandName != null)
            System.out.printf("%s :: Merging complete in %d.%03d s.\n",
                commandName, t.stopS(), t.fms());
    }
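
    // A minimal sketch of merging BAM parts produced with
    // getMergeableWorkFile() into one file (the paths and command name are
    // assumptions):
    //
    //     Utils.mergeSAMInto(new Path("/results/out.bam"),
    //                        new Path("/tmp/work"), "", "",
    //                        SAMFormat.BAM, conf, "sort");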

    private static String stringencyOptHelp = null;

    public static String getStringencyOptHelp() {
        if (stringencyOptHelp != null)
            return stringencyOptHelp;

        final StringBuilder sb = new StringBuilder(
            "set Picard's validation stringency to S (");

        String last = null;
        for (final ValidationStringency v : ValidationStringency.values()) {
            if (last != null) {
                sb.append(last);
                sb.append(", ");
            }
            last = v.name();
        }
        sb.append("or ");
        sb.append(last);
        sb.append(')');
        return stringencyOptHelp = sb.toString();
    }

    /** Sets the default validation stringency in addition to returning it. */
    @SuppressWarnings("deprecation")
    public static ValidationStringency toStringency(
        Object o, String cmdName)
    {
        final String s = (String)o;
        try {
            final ValidationStringency stringency =
                ValidationStringency.valueOf(s);
            SAMFileReader.setDefaultValidationStringency(stringency);
            return stringency;
        } catch (IllegalArgumentException e) {
            System.err.printf("%s :: invalid stringency '%s'\n", cmdName, s);
            return null;
        }
    }
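
    // Illustrative command-line handling (the surrounding option parsing and
    // the "view" command name are assumptions): a null return signals an
    // unrecognized stringency name.
    //
    //     ValidationStringency v = Utils.toStringency(optionValue, "view");
    //     if (v == null)
    //         return 3; // exit with an error status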
}