All downloads are free. Search and download functionality uses the official Maven repository.

org.seqdoop.hadoop_bam.AnySAMInputFormat Maven / Gradle / Ivy

Go to download

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.

There is a newer version: 7.10.0
Show newest version
// Copyright (c) 2012 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2012-02-22 20:40:39

package org.seqdoop.hadoop_bam;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathNotFoundException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/** An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM, BAM, and CRAM files.
 * Values are the individual records; see {@link BAMRecordReader} for the
 * meaning of the key.
 *
 * 

By default, files are recognized as SAM, BAM, or CRAM based on their file * extensions: see {@link #TRUST_EXTS_PROPERTY}. If that fails, or this * behaviour is disabled, the first byte of each file is read to determine the * file type.

*/ public class AnySAMInputFormat extends FileInputFormat { /** A Boolean property: are file extensions trusted? The default is * true. * * @see SAMFormat#inferFromFilePath */ public static final String TRUST_EXTS_PROPERTY = "hadoopbam.anysam.trust-exts"; private final BAMInputFormat bamIF = new BAMInputFormat(); private final CRAMInputFormat cramIF = new CRAMInputFormat(); private final SAMInputFormat samIF = new SAMInputFormat(); private final Map formatMap; private final boolean givenMap; private Configuration conf; /** Creates a new input format, which will use the * Configuration from the first public method called. Thus this * will behave as though constructed with a Configuration * directly, but only after it has received it in * createRecordReader (via the TaskAttemptContext) * or isSplitable or getSplits (via the * JobContext). Until then, other methods will throw an {@link * IllegalStateException}. * * This constructor exists mainly as a convenience, e.g. so that * AnySAMInputFormat can be used directly in * Job.setInputFormatClass. */ public AnySAMInputFormat() { this(null, new HashMap<>(), false); } /** Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from * the given Configuration. */ public AnySAMInputFormat(Configuration conf) { this(conf, new HashMap<>(), false); } private static boolean trustExtensions(Configuration conf) { return conf.getBoolean(TRUST_EXTS_PROPERTY, true); } /** Creates a new input format, trusting the given Map to * define the file-to-format associations. Neither file paths nor their * contents are looked at, only the Map is used. * *

The Map is not copied, so it should not be modified while * this input format is in use!

* */ public AnySAMInputFormat(Map formatMap) { this(null, formatMap, true); } private AnySAMInputFormat(Configuration conf, Map formatMap, boolean givenMap){ this.formatMap = formatMap; this.givenMap = givenMap; this.conf = conf; } /** Returns the {@link SAMFormat} corresponding to the given path. Returns * null if it cannot be determined even based on the file * contents (unless future SAM/BAM formats are very different, this means * that the path does not refer to a SAM or BAM file). * *

If this input format was constructed using a given * Map<Path,SAMFormat> and the path is not contained * within that map, throws an {@link IllegalArgumentException}.

*/ public SAMFormat getFormat(final Path path) throws PathNotFoundException { SAMFormat fmt = formatMap.get(path); if (fmt != null || formatMap.containsKey(path)) return fmt; if (givenMap) throw new IllegalArgumentException( "SAM format for '"+path+"' not in given map"); if (this.conf == null) throw new IllegalStateException("Don't have a Configuration yet"); if (trustExtensions(conf)) { final SAMFormat f = SAMFormat.inferFromFilePath(path); if (f != null) { formatMap.put(path, f); return f; } } try { FileSystem fileSystem = path.getFileSystem(conf); if (!fileSystem.exists(path)) { throw new PathNotFoundException(path.toString()); } fmt = SAMFormat.inferFromData(fileSystem.open(path)); } catch (IOException e) {} formatMap.put(path, fmt); return fmt; } /** Returns a {@link BAMRecordReader} or {@link SAMRecordReader} as * appropriate, initialized with the given parameters. * *

Throws {@link IllegalArgumentException} if the given input split is * not a {@link FileVirtualSplit} (used by {@link BAMInputFormat}) or a * {@link FileSplit} (used by {@link SAMInputFormat}), or if the path * referred to is not recognized as a SAM, BAM, or CRAM file (see {@link * #getFormat}).

*/ @Override public RecordReader createRecordReader(InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { final Path path; if (split instanceof FileSplit) path = ((FileSplit)split).getPath(); else if (split instanceof FileVirtualSplit) path = ((FileVirtualSplit)split).getPath(); else throw new IllegalArgumentException( "split '"+split+"' has unknown type: cannot extract path"); if (this.conf == null) this.conf = ctx.getConfiguration(); final SAMFormat fmt = getFormat(path); if (fmt == null) throw new IllegalArgumentException( "unknown SAM format, cannot create RecordReader: "+path); switch (fmt) { case SAM: return samIF.createRecordReader(split, ctx); case BAM: return bamIF.createRecordReader(split, ctx); case CRAM: return cramIF.createRecordReader(split, ctx); default: assert false; return null; } } /** Defers to {@link BAMInputFormat}, {@link CRAMInputFormat}, or * {@link SAMInputFormat} as appropriate for the given path. */ @Override public boolean isSplitable(JobContext job, Path path) { if (this.conf == null) this.conf = job.getConfiguration(); try { final SAMFormat fmt = getFormat(path); if (fmt == null) return super.isSplitable(job, path); switch (fmt) { case SAM: return samIF.isSplitable(job, path); case BAM: return bamIF.isSplitable(job, path); case CRAM: return cramIF.isSplitable(job, path); default: assert false; return false; } } catch (PathNotFoundException e) { return super.isSplitable(job, path); } } /** Defers to {@link BAMInputFormat} or {@link CRAMInputFormat} as appropriate for each * individual path. SAM paths do not require special handling, so their splits are left * unchanged. */ @Override public List getSplits(JobContext job) throws IOException { if (this.conf == null) this.conf = job.getConfiguration(); final List origSplits = BAMInputFormat.removeIndexFiles(super.getSplits(job)); // We have to partition the splits by input format and hand them over to // the *InputFormats for any further handling. 
// // BAMInputFormat and CRAMInputFormat need to change the split boundaries, so we can // just extract the BAM and CRAM ones and leave the rest as they are. final List bamOrigSplits = new ArrayList(origSplits.size()), cramOrigSplits = new ArrayList(origSplits.size()), newSplits = new ArrayList(origSplits.size()); for (final InputSplit iSplit : origSplits) { final FileSplit split = (FileSplit)iSplit; if (SAMFormat.BAM.equals(getFormat(split.getPath()))) bamOrigSplits.add(split); else if (SAMFormat.CRAM.equals(getFormat(split.getPath()))) cramOrigSplits.add(split); else newSplits.add(split); } newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy