
org.seqdoop.hadoop_bam.cli.plugins.FixMate (hadoop-bam, Maven / Gradle / Ivy)
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
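For orientation: hadoop-bam's command-line plugins, including this one, are normally dispatched through the library's CLI frontend rather than run directly. A minimal invocation sketch, assuming the org.seqdoop.hadoop_bam.cli.Frontend entry point and the argument layout given in the usage string below (a hypothetical example, not verified against this exact version):

    // Hypothetical launcher: dispatches to the "fixmate" plugin with a
    // global query-name sort (-s), writing output parts under workdir/.
    public class RunFixMate {
        public static void main(String[] args) throws Exception {
            org.seqdoop.hadoop_bam.cli.Frontend.main(new String[] {
                "fixmate", "-s", "workdir", "input1.bam", "input2.bam" });
        }
    }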
// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2013-06-18 10:26:55
package org.seqdoop.hadoop_bam.cli.plugins;

import hbparquet.hadoop.util.ContextUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import htsjdk.samtools.SamPairUtil;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.ValidationStringency;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.seqdoop.hadoop_bam.AnySAMInputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;
import org.seqdoop.hadoop_bam.cli.CLIMRBAMPlugin;
import org.seqdoop.hadoop_bam.cli.CLIMergingAnySAMOutputFormat;
import org.seqdoop.hadoop_bam.cli.Utils;
import org.seqdoop.hadoop_bam.custom.jargs.gnu.CmdLineParser;
import org.seqdoop.hadoop_bam.custom.jargs.gnu.CmdLineParser.Option.BooleanOption;
import org.seqdoop.hadoop_bam.custom.jargs.gnu.CmdLineParser.Option.StringOption;
import org.seqdoop.hadoop_bam.util.Pair;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import org.seqdoop.hadoop_bam.util.Timer;

public final class FixMate extends CLIMRBAMPlugin {
    private static final List<Pair<CmdLineParser.Option, String>> optionDescs
        = new ArrayList<Pair<CmdLineParser.Option, String>>();

    private static final CmdLineParser.Option
        sortOpt       = new BooleanOption('s', "sort"),
        noCombinerOpt = new BooleanOption('C', "no-combine"),
        stringencyOpt = new StringOption("validation-stringency=S");

    public FixMate() {
        super("fixmate", "BAM and SAM mate information fixing", "1.1",
            "WORKDIR INPATH [INPATH...]", optionDescs,
            "Merges together the BAM and SAM files (the INPATHs), while filling "+
            "in mate information, all distributed with Hadoop MapReduce. Output "+
            "parts are placed in WORKDIR in, by default, headerless and "+
            "unterminated BAM format."+
            "\n\n"+
            "When more than two primary reads with the same name exist in the "+
            "inputs, the result is unpredictable. Without using the -C option, "+
            "it is possible that multiple reads are mated to the same read.");
    }
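
    // In outline: FixMateMapper keys every record by its read name, so that
    // a read and its mate arrive in the same reduce() call; FixMateReducer
    // then pairs up primary alignments with SamPairUtil.setMateInfo() and
    // passes non-primary alignments through unchanged. When the reducer also
    // runs as a combiner (the default; see the -C caveat in the usage text
    // above), a single combiner invocation may see only a subset of the
    // records sharing a name, which is how invalid pairings can arise.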

    static {
        optionDescs.add(new Pair<CmdLineParser.Option, String>(
            sortOpt, "also globally sort the result by query name"));
        optionDescs.add(new Pair<CmdLineParser.Option, String>(
            noCombinerOpt, "don't use a combiner; less efficient, but "+
                           "guarantees validity of results when there are "+
                           "multiple possible pairings"));
        optionDescs.add(new Pair<CmdLineParser.Option, String>(
            stringencyOpt, Utils.getStringencyOptHelp()));
    }

    @Override protected int run(CmdLineParser parser) {
        final List<String> args = parser.getRemainingArgs();
        if (args.isEmpty()) {
            System.err.println("fixmate :: WORKDIR not given.");
            return 3;
        }
        if (args.size() == 1) {
            System.err.println("fixmate :: INPATH not given.");
            return 3;
        }
        if (!cacheAndSetProperties(parser))
            return 3;

        final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(
                stringencyOpt,
                ValidationStringency.DEFAULT_STRINGENCY.toString()),
            "fixmate");
        if (stringency == null)
            return 3;

        Path wrkDir = new Path(args.get(0));

        final List<String> strInputs = args.subList(1, args.size());
        final List<Path> inputs = new ArrayList<Path>(strInputs.size());
        for (final String in : strInputs)
            inputs.add(new Path(in));

        final Configuration conf = getConf();

        // Used by Utils.getMergeableWorkFile() to name the output files.
        final String intermediateOutName =
            (outPath == null ? inputs.get(0) : outPath).getName();
        conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

        if (stringency != null)
            conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
                     stringency.toString());

        final boolean globalSort = parser.getBoolean(sortOpt);
        if (globalSort)
            Utils.setHeaderMergerSortOrder(
                conf, SAMFileHeader.SortOrder.queryname);

        conf.setStrings(
            Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));
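
        // The properties set above let every task rebuild the same merged
        // SAM header from the input paths (see Utils.getSAMHeaderMerger() in
        // the reducer); the work files themselves are headerless, and the
        // merged header is only written when the parts are combined into
        // outPath via Utils.mergeSAMInto() below.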

        final Timer t = new Timer();
        try {
            // Required for path ".", for example.
            wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

            if (globalSort)
                Utils.configureSampling(wrkDir, intermediateOutName, conf);

            final Job job = new Job(conf);

            job.setJarByClass  (FixMate.class);
            job.setMapperClass (FixMateMapper.class);
            job.setReducerClass(FixMateReducer.class);

            if (!parser.getBoolean(noCombinerOpt))
                job.setCombinerClass(FixMateReducer.class);

            job.setOutputKeyClass  (Text.class);
            job.setOutputValueClass(SAMRecordWritable.class);

            job.setInputFormatClass (AnySAMInputFormat.class);
            job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

            for (final Path in : inputs)
                FileInputFormat.addInputPath(job, in);

            FileOutputFormat.setOutputPath(job, wrkDir);
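
            // A global sort requires TotalOrderPartitioner, which reads a
            // partition file of key split points such that every key sent to
            // reducer i sorts before every key sent to reducer i+1; the
            // split points are estimated by randomly sampling input keys
            // with InputSampler.RandomSampler before the job is submitted.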
            if (globalSort) {
                job.setPartitionerClass(TotalOrderPartitioner.class);

                System.out.println("fixmate :: Sampling...");
                t.start();

                InputSampler.writePartitionFile(
                    job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(
                        0.01, 10000, Math.max(100, reduceTasks)));

                System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n",
                                  t.stopS(), t.fms());
            }

            job.submit();

            System.out.println("fixmate :: Waiting for job completion...");
            t.start();

            if (!job.waitForCompletion(verbose)) {
                System.err.println("fixmate :: Job failed.");
                return 4;
            }
            System.out.printf("fixmate :: Job complete in %d.%03d s.\n",
                              t.stopS(), t.fms());

        } catch (IOException e) {
            System.err.printf("fixmate :: Hadoop error: %s\n", e);
            return 4;
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }

        if (outPath != null) try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }

        return 0;
    }
}

final class FixMateMapper
    extends Mapper<LongWritable, SAMRecordWritable, Text, SAMRecordWritable>
{
    @Override protected void map(
            LongWritable ignored, SAMRecordWritable wrec,
            Mapper<LongWritable, SAMRecordWritable,
                   Text, SAMRecordWritable>.Context ctx)
        throws InterruptedException, IOException
    {
        Utils.correctSAMRecordForMerging(
            wrec.get(), ContextUtil.getConfiguration(ctx));
        ctx.write(new Text(wrec.get().getReadName()), wrec);
    }
}
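
// htsjdk's SamPairUtil.setMateInfo() fills in each record's mate reference,
// mate position and mate-strand flag, plus the inferred insert size, from
// its partner record; that is the "mate information fixing" performed here.
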
// Because this can be used as a combiner, we output the key instead of a
// NullWritable.
final class FixMateReducer
    extends Reducer<Text, SAMRecordWritable, Text, SAMRecordWritable>
{
    private final SAMRecordWritable wrec = new SAMRecordWritable();

    @Override protected void reduce(
            Text key, Iterable<SAMRecordWritable> records,
            Reducer<Text, SAMRecordWritable,
                    Text, SAMRecordWritable>.Context ctx)
        throws IOException, InterruptedException
    {
        // Non-primary records are simply written out, but as long as we can
        // find two primaries, pair them up.

        final SAMFileHeader header = Utils.getSAMHeaderMerger(
            ContextUtil.getConfiguration(ctx)).getMergedHeader();

        final Iterator<SAMRecordWritable> it = records.iterator();
        while (it.hasNext()) {
            SAMRecordWritable a = it.next();

            if (a.get().getNotPrimaryAlignmentFlag()) {
                ctx.write(key, a);
                continue;
            }

            // Cache the record since the iterator does its own caching,
            // meaning that after another it.next() we would have a == b.
            wrec.set(a.get());
            a = wrec;

            SAMRecordWritable b = null;
            while (it.hasNext()) {
                b = it.next();
                if (!b.get().getNotPrimaryAlignmentFlag())
                    break;
                ctx.write(key, b);
            }

            if (b == null || b.get().getNotPrimaryAlignmentFlag()) {
                // No second primary found: either the group ran out of
                // records (b == null) or it ended in non-primaries, which
                // were already written above. Write the unpaired primary
                // as-is.
                ctx.write(key, a);
                break;
            }

            a.get().setHeader(header);
            b.get().setHeader(header);
            SamPairUtil.setMateInfo(a.get(), b.get(), header);

            ctx.write(key, a);
            ctx.write(key, b);
        }
    }
}