
org.seqdoop.hadoop_bam.VCFInputFormat Maven / Gradle / Ivy


A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
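For orientation, here is a minimal sketch of wiring this input format into a MapReduce job. The driver class name, job name, and input path are placeholders; only VCFInputFormat, the `hadoopbam.vcf.intervals` property, and the standard Hadoop APIs come from the source below:

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
	import org.seqdoop.hadoop_bam.VCFInputFormat;

	public class VcfJobSetup {
		public static void main(String[] args) throws Exception {
			Configuration conf = new Configuration();
			// Optional: restrict reading to regions of interest, like samtools -L.
			conf.set("hadoopbam.vcf.intervals", "chr1:1-20000,chr2:12000-20000");
			Job job = Job.getInstance(conf, "vcf-example");
			job.setInputFormatClass(VCFInputFormat.class);
			// Keys are LongWritable; values are VariantContextWritable records.
			FileInputFormat.addInputPath(job, new Path("/data/sample.vcf.gz"));
			// Mapper/reducer wiring and job.waitForCompletion(true) omitted.
		}
	}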

There is a newer version: 7.10.0
// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2013-06-26 12:47:26

package org.seqdoop.hadoop_bam;

import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.index.Block;
import htsjdk.tribble.index.tabix.TabixIndex;
import htsjdk.tribble.util.TabixUtils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import htsjdk.samtools.seekablestream.SeekableStream;

import org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec;
import org.seqdoop.hadoop_bam.util.BGZFCodec;
import org.seqdoop.hadoop_bam.util.IntervalUtil;
import org.seqdoop.hadoop_bam.util.WrapSeekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** An {@link org.apache.hadoop.mapreduce.InputFormat} for VCF files. Values
 * are the individual records; see {@link VCFRecordReader} for the meaning of
 * the key.
 */
public class VCFInputFormat
	extends FileInputFormat<LongWritable,VariantContextWritable>
{
	private static final Logger logger = LoggerFactory.getLogger(VCFInputFormat.class);

	/** Whether file extensions are to be trusted, defaults to true.
	 *
	 * @see VCFFormat#inferFromFilePath
	 */
	public static final String TRUST_EXTS_PROPERTY =
		"hadoopbam.vcf.trust-exts";

	/**
	 * Filter by region, like -L in SAMtools. Takes a comma-separated
	 * list of intervals, e.g. chr1:1-20000,chr2:12000-20000. For
	 * programmatic use {@link #setIntervals(Configuration, List)} should be preferred.
	 */
	public static final String INTERVALS_PROPERTY = "hadoopbam.vcf.intervals";

	public static <T extends Locatable> void setIntervals(Configuration conf,
			List<T> intervals) {
		StringBuilder sb = new StringBuilder();
		for (Iterator<T> it = intervals.iterator(); it.hasNext(); ) {
			Locatable l = it.next();
			sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd()));
			if (it.hasNext()) {
				sb.append(",");
			}
		}
		conf.set(INTERVALS_PROPERTY, sb.toString());
	}
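	// Usage sketch (hypothetical coordinates), equivalent to setting
	// INTERVALS_PROPERTY to "chr1:1-20000,chr2:12000-20000":
	//
	//   List<Interval> intervals = Arrays.asList(
	//       new Interval("chr1", 1, 20000),
	//       new Interval("chr2", 12000, 20000));
	//   VCFInputFormat.setIntervals(conf, intervals);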

	static List<Interval> getIntervals(Configuration conf) {
		return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY);
	}

	private final Map<Path,VCFFormat> formatMap;
	private final boolean             givenMap;

	private Configuration conf;
	private boolean trustExts;

	/** Creates a new input format, which will use the
	 * Configuration from the first public method called. Thus this
	 * will behave as though constructed with a Configuration
	 * directly, but only after it has received it in
	 * createRecordReader (via the TaskAttemptContext)
	 * or isSplitable or getSplits (via the
	 * JobContext). Until then, other methods will throw an {@link
	 * IllegalStateException}.
	 *
	 * This constructor exists mainly as a convenience, e.g. so that
	 * VCFInputFormat can be used directly in
	 * Job.setInputFormatClass.
	 */
	public VCFInputFormat() {
		this.formatMap = new HashMap<>();
		this.givenMap  = false;
		this.conf      = null;
	}

	/** Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from
	 * the given Configuration.
	 */
	public VCFInputFormat(Configuration conf) {
		this.formatMap = new HashMap<>();
		this.conf      = conf;
		this.trustExts = conf.getBoolean(TRUST_EXTS_PROPERTY, true);
		this.givenMap  = false;
	}

	/** Creates a new input format, trusting the given Map to
	 * define the file-to-format associations. Neither file paths nor their
	 * contents are looked at, only the Map is used.
	 *
	 * <p>The Map is not copied, so it should not be modified while
	 * this input format is in use!</p>
	 */
	public VCFInputFormat(Map<Path,VCFFormat> formatMap) {
		this.formatMap = formatMap;
		this.givenMap  = true;

		// Arbitrary values.
		this.conf      = null;
		this.trustExts = false;
	}

	/** Returns the {@link VCFFormat} corresponding to the given path. Returns
	 * null if it cannot be determined even based on the file contents (unless
	 * future VCF/BCF formats are very different, this means that the path
	 * does not refer to a VCF or BCF file).
	 *
	 * <p>If this input format was constructed using a given
	 * {@code Map<Path,VCFFormat>} and the path is not contained within that
	 * map, throws an {@link IllegalArgumentException}.</p>
	 */
	public VCFFormat getFormat(final Path path) {
		VCFFormat fmt = formatMap.get(path);
		if (fmt != null || formatMap.containsKey(path))
			return fmt;

		if (givenMap)
			throw new IllegalArgumentException(
				"VCF format for '"+path+"' not in given map");

		if (this.conf == null)
			throw new IllegalStateException("Don't have a Configuration yet");

		if (trustExts) {
			final VCFFormat f = VCFFormat.inferFromFilePath(path);
			if (f != null) {
				formatMap.put(path, f);
				return f;
			}
		}

		try {
			fmt = VCFFormat.inferFromData(path.getFileSystem(conf).open(path));
		} catch (IOException e) {
			// Fall through with fmt == null: the format simply stays unknown.
		}

		formatMap.put(path, fmt);
		return fmt;
	}

	@Override protected boolean isSplitable(JobContext context, Path filename) {
		Configuration conf = context.getConfiguration();
		final CompressionCodec codec =
			new CompressionCodecFactory(conf).getCodec(filename);
		if (codec == null) {
			return true;
		}
		if (codec instanceof BGZFCodec || codec instanceof BGZFEnhancedGzipCodec) {
			boolean splittable;
			try {
				try (FSDataInputStream in = filename.getFileSystem(conf).open(filename)) {
					splittable = BlockCompressedInputStream.isValidFile(
						new BufferedInputStream(in));
				}
			} catch (IOException e) {
				// Can't determine if BGZF or gzip; conservatively assume the latter.
				splittable = false;
			}
			if (!splittable) {
				logger.warn("{} is not splittable, consider using block-compressed gzip (BGZF)",
					filename);
			}
			return splittable;
		} else if (codec instanceof GzipCodec) {
			logger.warn("Using GzipCodec, which is not splittable, consider using " +
				"block-compressed gzip (BGZF) and BGZFCodec/BGZFEnhancedGzipCodec.");
		}
		return codec instanceof SplittableCompressionCodec;
	}

	/** Returns a {@link BCFRecordReader} or {@link VCFRecordReader} as
	 * appropriate, initialized with the given parameters.
	 *
	 * <p>Throws {@link IllegalArgumentException} if the given input split is
	 * not a {@link FileVirtualSplit} or a {@link FileSplit}, or if the path
	 * referred to is not recognized as a VCF or BCF file (see
	 * {@link #getFormat}).</p>
	 */
	@Override public RecordReader<LongWritable,VariantContextWritable>
		createRecordReader(InputSplit split, TaskAttemptContext ctx)
			throws InterruptedException, IOException
	{
		final Path path;
		if (split instanceof FileSplit)
			path = ((FileSplit)split).getPath();
		else if (split instanceof FileVirtualSplit)
			path = ((FileVirtualSplit)split).getPath();
		else
			throw new IllegalArgumentException(
				"split '"+split+"' has unknown type: cannot extract path");

		if (this.conf == null)
			this.conf = ctx.getConfiguration();

		final VCFFormat fmt = getFormat(path);
		if (fmt == null)
			throw new IllegalArgumentException(
				"unknown VCF format, cannot create RecordReader: "+path);

		final RecordReader<LongWritable,VariantContextWritable> rr;
		switch (fmt) {
			case VCF: rr = new VCFRecordReader(); break;
			case BCF: rr = new BCFRecordReader(); break;
			default: assert false; return null;
		}

		rr.initialize(split, ctx);
		return rr;
	}

	/** Defers to {@link BCFSplitGuesser} as appropriate for each individual
	 * path. VCF paths do not require special handling, so their splits are
	 * left unchanged.
	 */
	@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
		if (this.conf == null)
			this.conf = job.getConfiguration();

		final List<InputSplit> origSplits = super.getSplits(job);

		// Partition the splits by input format and hand the BCF ones over to
		// fixBCFSplits().
		final List<FileSplit>  bcfOrigSplits = new ArrayList<>(origSplits.size());
		final List<InputSplit> newSplits     = new ArrayList<>(origSplits.size());

		for (final InputSplit iSplit : origSplits) {
			final FileSplit split = (FileSplit)iSplit;

			if (VCFFormat.BCF.equals(getFormat(split.getPath())))
				bcfOrigSplits.add(split);
			else
				newSplits.add(split);
		}
		fixBCFSplits(bcfOrigSplits, newSplits);
		return filterByInterval(newSplits, conf);
	}

	// The given FileSplits should all be for BCF files. Adds InputSplits
	// aligned to record boundaries. Compressed BCF results in
	// FileVirtualSplits, uncompressed in FileSplits.
	private void fixBCFSplits(List<FileSplit> splits, List<InputSplit> newSplits)
		throws IOException
	{
		// addGuessedSplits() requires the given splits to be sorted by file
		// path, so do so. Although FileInputFormat.getSplits() does, at the
		// time of writing this, generate them in that order, we shouldn't
		// rely on it.
		Collections.sort(splits, new Comparator<FileSplit>() {
			public int compare(FileSplit a, FileSplit b) {
				return a.getPath().compareTo(b.getPath());
			}
		});

		for (int i = 0; i < splits.size();)
			i = addGuessedSplits(splits, i, newSplits);
	}

	// Handles all the splits that share the Path of the one at index i,
	// returning the next index to be used.
	private int addGuessedSplits(
			List<FileSplit> splits, int i, List<InputSplit> newSplits)
		throws IOException
	{
		final Path path = splits.get(i).getPath();
		final SeekableStream sin = WrapSeekable.openPath(conf, path);

		final BCFSplitGuesser guesser = new BCFSplitGuesser(sin);
		final boolean isBGZF = guesser.isBGZF();

		InputSplit prevSplit = null;

		for (; i < splits.size(); ++i) {
			final FileSplit fspl = splits.get(i);
			if (!fspl.getPath().equals(path))
				break;

			final String[] locs = fspl.getLocations();

			final long beg =       fspl.getStart();
			final long end = beg + fspl.getLength();

			final long alignBeg = guesser.guessNextBCFRecordStart(beg, end);

			// As the guesser goes to the next BGZF block before looking for
			// BCF records, the ending BGZF blocks have to always be traversed
			// fully. Hence force the length to be 0xffff, the maximum
			// possible.
			final long alignEnd = isBGZF ? end << 16 | 0xffff : end;
			final long length   = alignEnd - alignBeg;

			if (alignBeg == end) {
				// No records detected in this split: merge it to the previous
				// one. This could legitimately happen e.g. if we have a split
				// that is so small that it only contains the middle part of a
				// BGZF block.
				//
				// Of course, if it's the first split, then this is simply not
				// a valid BCF file.
				//
				// FIXME: In theory, any number of splits could only contain
				// parts of the BCF header before we start to see splits that
				// contain BCF records. For now, we require that the split
				// size is at least as big as the header and don't handle that
				// case.
				if (prevSplit == null)
					throw new IOException("'" + path + "': no records in " +
						"first split: bad BCF file or tiny split size?");

				if (isBGZF) {
					((FileVirtualSplit)prevSplit).setEndVirtualOffset(alignEnd);
					continue;
				}
				prevSplit = new FileSplit(path, alignBeg, length, locs);
				newSplits.remove(newSplits.size() - 1);
			} else {
				prevSplit = isBGZF
					? new FileVirtualSplit(path, alignBeg, alignEnd, locs)
					: new FileSplit       (path, alignBeg, length,   locs);
			}
			newSplits.add(prevSplit);
		}

		sin.close();
		return i;
	}

	private List<InputSplit> filterByInterval(
			List<InputSplit> splits, Configuration conf)
		throws IOException
	{
		List<Interval> intervals = getIntervals(conf);
		if (intervals == null) {
			return splits;
		}

		// Collect the files the splits refer to, then use each file's tabix
		// index to find the BGZF blocks overlapping the requested intervals.
		List<Block> blocks = new ArrayList<>();
		Set<Path> vcfFiles = new LinkedHashSet<>();
		for (InputSplit split : splits) {
			if (split instanceof FileSplit) {
				vcfFiles.add(((FileSplit) split).getPath());
			} else if (split instanceof FileVirtualSplit) {
				vcfFiles.add(((FileVirtualSplit) split).getPath());
			} else {
				throw new IllegalArgumentException(
					"split '"+split+"' has unknown type: cannot extract path");
			}
		}
		for (Path vcfFile : vcfFiles) {
			Path indexFile = vcfFile.suffix(TabixUtils.STANDARD_INDEX_EXTENSION);
			FileSystem fs = vcfFile.getFileSystem(conf);
			if (!fs.exists(indexFile)) {
				logger.warn("No tabix index file found for {}, splits will not be " +
					"filtered, which may be very inefficient", indexFile);
				return splits;
			}
			try (InputStream in = new BlockCompressedInputStream(fs.open(indexFile))) {
				TabixIndex index = new TabixIndex(in);
				for (Locatable interval : intervals) {
					String contig = interval.getContig();
					int intervalStart = interval.getStart();
					int intervalEnd = interval.getEnd();
					blocks.addAll(index.getBlocks(contig, intervalStart, intervalEnd));
				}
			}
		}

		// Use the blocks to filter the splits.
		List<InputSplit> filteredSplits = new ArrayList<>();
		for (InputSplit split : splits) {
			if (split instanceof FileSplit) {
				FileSplit fileSplit = (FileSplit) split;
				long splitStart = fileSplit.getStart() << 16;
				long splitEnd = (fileSplit.getStart() + fileSplit.getLength()) << 16;
				// If any block overlaps with the split, keep the split, but
				// don't adjust its size, as BGZF block decompression is
				// handled by BGZFCodec, not by the reader directly.
				for (Block block : blocks) {
					long blockStart = block.getStartPosition();
					long blockEnd   = block.getEndPosition();
					if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) {
						filteredSplits.add(split);
						break;
					}
				}
			} else {
				FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
				long splitStart = virtualSplit.getStartVirtualOffset();
				long splitEnd = virtualSplit.getEndVirtualOffset();
				// If any block overlaps with the split, keep the split, but
				// adjust the start and end to the maximally overlapping
				// portion over all blocks that overlap.
				long newStart = Long.MAX_VALUE;
				long newEnd = Long.MIN_VALUE;
				boolean overlaps = false;
				for (Block block : blocks) {
					long blockStart = block.getStartPosition();
					long blockEnd   = block.getEndPosition();
					if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) {
						long overlapStart = Math.max(splitStart, blockStart);
						long overlapEnd   = Math.min(splitEnd, blockEnd);
						newStart = Math.min(newStart, overlapStart);
						newEnd   = Math.max(newEnd, overlapEnd);
						overlaps = true;
					}
				}
				if (overlaps) {
					filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(),
						newStart, newEnd, virtualSplit.getLocations()));
				}
			}
		}
		return filteredSplits;
	}

	private static boolean overlaps(long start, long end, long start2, long end2) {
		return (start2 >= start && start2 <= end) ||
		       (end2 >= start && end2 <= end) ||
		       (start >= start2 && end <= end2);
	}
}
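A note on the bit arithmetic in addGuessedSplits and filterByInterval: FileVirtualSplit boundaries are BGZF virtual file offsets, which pack a compressed-block file offset and a position within that block's uncompressed contents into a single long (htsjdk exposes this encoding via BlockCompressedFilePointerUtil). A sketch of the encoding, with an illustrative helper name:

	// Upper 48 bits address the compressed BGZF block; the lower 16 bits
	// index into its uncompressed contents (a block holds at most 64 KiB of
	// data, so 0xffff is the largest possible within-block offset).
	static long virtualOffset(long blockCompressedOffset, int withinBlock) {
		return (blockCompressedOffset << 16) | (withinBlock & 0xffff);
	}

This is why addGuessedSplits computes end << 16 | 0xffff (the last addressable position in the block starting at or before offset end), and why filterByInterval shifts plain FileSplit byte offsets left by 16 before comparing them against the virtual-offset blocks returned by the tabix index.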



