// Copyright (c) 2010 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2010-08-03 11:50:19

package org.seqdoop.hadoop_bam;

import htsjdk.samtools.AbstractBAMFileIndex;
import htsjdk.samtools.BAMFileReader;
import htsjdk.samtools.BAMFileSpan;
import htsjdk.samtools.BAMIndex;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.LinearBAMIndex;
import htsjdk.samtools.LinearIndex;
import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileSpan;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.seqdoop.hadoop_bam.util.IntervalUtil;
import org.seqdoop.hadoop_bam.util.NIOFileUtil;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import org.seqdoop.hadoop_bam.util.WrapSeekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.ProviderNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import htsjdk.samtools.seekablestream.SeekableStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/** An {@link org.apache.hadoop.mapreduce.InputFormat} for BAM files. Values
 * are the individual records; see {@link BAMRecordReader} for the meaning of
 * the key.
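 *
 * <p>A minimal job-configuration sketch (illustrative, not part of this class;
 * assumes an input file {@code reads.bam} on the default filesystem):
 * <pre>{@code
 * Job job = Job.getInstance(new Configuration());
 * job.setInputFormatClass(BAMInputFormat.class);
 * FileInputFormat.addInputPath(job, new Path("reads.bam"));
 * // the mapper then receives (LongWritable, SAMRecordWritable) pairs
 * }</pre>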
 */
public class BAMInputFormat
	extends FileInputFormat<LongWritable,SAMRecordWritable>
{
	private static final Logger logger = LoggerFactory.getLogger(BAMInputFormat.class);

	/**
	 * If set to true, only include reads that overlap the given intervals (if specified),
	 * and unplaced unmapped reads (if specified). For programmatic use
	 * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred.
	 */
	public static final String BOUNDED_TRAVERSAL_PROPERTY = "hadoopbam.bam.bounded-traversal";

	/**
	 * If set to true, enables the use of BAM indices to calculate splits.
	 * For programmatic use
	 * {@link #setEnableBAISplitCalculator(Configuration, boolean)} should be preferred.
	 * By default, this split calculator is disabled in favor of the splitting-bai calculator.
	 */
	public static final String ENABLE_BAI_SPLIT_CALCULATOR = "hadoopbam.bam.enable-bai-splitter";
    
	/**
	 * Filter by region, like -L in SAMtools. Takes a comma-separated
	 * list of intervals, e.g. chr1:1-20000,chr2:12000-20000. For
	 * programmatic use {@link #setIntervals(Configuration, List)} should be preferred.
	 */
	public static final String INTERVALS_PROPERTY = "hadoopbam.bam.intervals";
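
	// Illustrative example (not from this file): with a Tool driver that uses
	// GenericOptionsParser, the property can be supplied on the command line:
	//   hadoop jar myjob.jar ... -D hadoopbam.bam.intervals=chr1:1-20000,chr2:12000-20000
	// ("myjob.jar" is a hypothetical job jar; in code, prefer setIntervals().)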

	/**
	 * If set to true, include unplaced unmapped reads (that is, unmapped reads with no
	 * position). For programmatic use
	 * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred.
	 */
	public static final String TRAVERSE_UNPLACED_UNMAPPED_PROPERTY = "hadoopbam.bam.traverse-unplaced-unmapped";

	/**
	 * Only include reads that overlap the given intervals. Unplaced unmapped reads are not
	 * included.
	 * @param conf the Hadoop configuration to set properties on
	 * @param intervals the intervals to filter by
	 * @param <T> the {@link Locatable} type
	 */
	public static <T extends Locatable> void setIntervals(Configuration conf,
			List<T> intervals) {
		setTraversalParameters(conf, intervals, false);
	}
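
	// A minimal sketch of programmatic interval filtering, assuming htsjdk's
	// Interval (which implements Locatable); the contig names and coordinates
	// below are examples only:
	//
	//   Configuration conf = new Configuration();
	//   List<Interval> intervals = new ArrayList<>();
	//   intervals.add(new Interval("chr1", 1, 20000));
	//   intervals.add(new Interval("chr2", 12000, 20000));
	//   BAMInputFormat.setIntervals(conf, intervals);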

	/**
	 * Enables or disables the split calculator that uses the BAM index to calculate splits.
	 */
	public static void setEnableBAISplitCalculator(Configuration conf,
			boolean setEnabled) {
		conf.setBoolean(ENABLE_BAI_SPLIT_CALCULATOR, setEnabled);
	}

	/**
	 * Only include reads that overlap the given intervals (if specified) and unplaced
	 * unmapped reads (if true).
	 * @param conf the Hadoop configuration to set properties on
	 * @param intervals the intervals to filter by, or null if all reads
	 *   are to be included (in which case traverseUnplacedUnmapped must be
	 *   true)
	 * @param traverseUnplacedUnmapped whether to include unplaced unmapped reads
	 * @param <T> the {@link Locatable} type
	 */
	public static <T extends Locatable> void setTraversalParameters(Configuration conf,
			List<T> intervals, boolean traverseUnplacedUnmapped) {
		if (intervals == null && !traverseUnplacedUnmapped) {
			throw new IllegalArgumentException("Traversing mapped reads only is not supported.");
		}
		conf.setBoolean(BOUNDED_TRAVERSAL_PROPERTY, true);
		if (intervals != null) {
			StringBuilder sb = new StringBuilder();
			for (Iterator<T> it = intervals.iterator(); it.hasNext(); ) {
				Locatable l = it.next();
				sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd()));
				if (it.hasNext()) {
					sb.append(",");
				}
			}
			conf.set(INTERVALS_PROPERTY, sb.toString());
		}
		conf.setBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, traverseUnplacedUnmapped);
	}
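
	// Illustrative call: to include only unplaced unmapped reads, pass null
	// intervals with unmapped traversal enabled (null intervals with false
	// would throw IllegalArgumentException, per the check above):
	//
	//   BAMInputFormat.setTraversalParameters(conf, null, true);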

	/**
	 * Reset traversal parameters so that all reads are included.
	 * @param conf the Hadoop configuration to set properties on
	 */
	public static void unsetTraversalParameters(Configuration conf) {
		conf.unset(BOUNDED_TRAVERSAL_PROPERTY);
		conf.unset(INTERVALS_PROPERTY);
		conf.unset(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY);
	}

	static boolean isBoundedTraversal(Configuration conf) {
		return conf.getBoolean(BOUNDED_TRAVERSAL_PROPERTY, false) ||
				conf.get(INTERVALS_PROPERTY) != null; // backwards compatibility
	}

	static boolean traverseUnplacedUnmapped(Configuration conf) {
		return conf.getBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, false);
	}

	static List<Interval> getIntervals(Configuration conf) {
		return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY);
	}

	static Path getIdxPath(Path path) {
		return path.suffix(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
	}

	static List<InputSplit> removeIndexFiles(List<InputSplit> splits) {
		// Remove any splitting bai files
		return splits.stream()
				.filter(split -> !((FileSplit) split).getPath().getName().endsWith(
						SplittingBAMIndexer.OUTPUT_FILE_EXTENSION))
				.filter(split -> !((FileSplit) split).getPath().getName().endsWith(
						BAMIndex.BAMIndexSuffix))
				.collect(Collectors.toList());
	}
    
	static Path getBAIPath(Path path) {
		return path.suffix(BAMIndex.BAMIndexSuffix);
	}

	/** Returns a {@link BAMRecordReader} initialized with the parameters. */
	@Override public RecordReader<LongWritable,SAMRecordWritable>
		createRecordReader(InputSplit split, TaskAttemptContext ctx)
			throws InterruptedException, IOException
	{
		final RecordReader<LongWritable,SAMRecordWritable> rr =
			new BAMRecordReader();
		rr.initialize(split, ctx);
		return rr;
	}

	/** The splits returned are {@link FileVirtualSplit FileVirtualSplits}. */
	@Override public List<InputSplit> getSplits(JobContext job)
		throws IOException
	{
		return getSplits(super.getSplits(job), job.getConfiguration());
	}

	public List<InputSplit> getSplits(
			List<InputSplit> splits, Configuration cfg)
		throws IOException
	{

		final List<InputSplit> origSplits = removeIndexFiles(splits);

		// Align the splits so that they don't cross blocks.

		// addIndexedSplits() requires the given splits to be sorted by file
		// path, so do so. Although FileInputFormat.getSplits() does, at the time
		// of writing this, generate them in that order, we shouldn't rely on it.
		Collections.sort(origSplits, new Comparator<InputSplit>() {
			public int compare(InputSplit a, InputSplit b) {
				FileSplit fa = (FileSplit)a, fb = (FileSplit)b;
				return fa.getPath().compareTo(fb.getPath());
			}
		});

		final List<InputSplit> newSplits =
			new ArrayList<InputSplit>(origSplits.size());

		for (int i = 0; i < origSplits.size();) {
			try {
				i = addIndexedSplits(origSplits, i, newSplits, cfg);
			} catch (IOException | ProviderNotFoundException e) {
				if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) {
					try {
						i = addBAISplits(origSplits, i, newSplits, cfg);
					} catch (IOException | ProviderNotFoundException e2) {
						i = addProbabilisticSplits(origSplits, i, newSplits, cfg);
					}
				} else {
					i = addProbabilisticSplits(origSplits, i, newSplits, cfg);
				}
			}
		}
		return filterByInterval(newSplits, cfg);
	}

	// Handles all the splits that share the Path of the one at index i,
	// returning the next index to be used.
	private int addIndexedSplits(
			List<InputSplit> splits, int i, List<InputSplit> newSplits,
			Configuration cfg)
		throws IOException
	{
		final Path file = ((FileSplit)splits.get(i)).getPath();
		List<InputSplit> potentialSplits = new ArrayList<InputSplit>();

		final SplittingBAMIndex idx = new SplittingBAMIndex(
			file.getFileSystem(cfg).open(getIdxPath(file)));

		int splitsEnd = splits.size();
		for (int j = i; j < splitsEnd; ++j)
			if (!file.equals(((FileSplit)splits.get(j)).getPath()))
				splitsEnd = j;

		if (idx.size() == 1) { // no alignments, only the file size, so no splits to add
			return splitsEnd;
		}

		for (int j = i; j < splitsEnd; ++j) {
			final FileSplit fileSplit = (FileSplit)splits.get(j);

			final long start =         fileSplit.getStart();
			final long end   = start + fileSplit.getLength();

			final Long blockStart = idx.nextAlignment(start);

			// The last split needs to end where the last alignment ends, but the
			// index doesn't store that data (whoops); we only know where the last
			// alignment begins. Fortunately there's no need to change the index
			// format for this: we can just set the end to the maximal length of
			// the final BGZF block (0xffff), and then read until BAMRecordCodec
			// hits EOF.
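			//
			// Worked example of the virtual-offset layout (upper 48 bits:
			// compressed file offset of the BGZF block, lower 16 bits: offset
			// within the uncompressed block; the numbers are illustrative):
			// if the last alignment begins at virtual offset 0x12340005, then
			// 0x12340005 | 0xffff == 0x1234ffff, i.e. the same BGZF block
			// padded out to the maximal in-block offset.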
			Long blockEnd;
			if (j == splitsEnd - 1) {
				blockEnd = idx.prevAlignment(end) | 0xffff;
			} else {
				blockEnd = idx.nextAlignment(end);
			}

			if (blockStart == null || blockEnd == null) {
				logger.warn("Index for {} was not good. Generating probabilistic splits.", file);
				return addProbabilisticSplits(splits, i, newSplits, cfg);
			}

			potentialSplits.add(new FileVirtualSplit(
						file, blockStart, blockEnd, fileSplit.getLocations()));
		}

		for (InputSplit s : potentialSplits) {
			newSplits.add(s);
		}
		return splitsEnd;
	}

	// Handles all the splits that share the Path of the one at index i,
	// returning the next index to be used.
	private int addBAISplits(List<InputSplit> splits,
			int i,
			List<InputSplit> newSplits,
			Configuration conf) throws IOException {
		final Path path = ((FileSplit)splits.get(i)).getPath();
		FileSystem fs = path.getFileSystem(conf);
		int splitsEnd = i;

		try (FSDataInputStream in = fs.open(path)) {
			SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
			SAMSequenceDictionary dict = header.getSequenceDictionary();

			final SeekableStream guesserSin =
				WrapSeekable.openPath(fs, path);
			final BAMSplitGuesser guesser = new BAMSplitGuesser(guesserSin, conf);

			final SeekableStream sin;
			if (fs.exists(getBAIPath(path))) {
				sin = WrapSeekable.openPath(fs, getBAIPath(path));
			} else {
				sin = WrapSeekable.openPath(fs, new Path(path.toString()
						.replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix)));
			}
			final LinearBAMIndex idx = new LinearBAMIndex(sin, dict);

			// searches for the first contig that contains linear bins
			// a contig will have no linear bins if there are no reads mapped to that
			// contig (e.g., reads were aligned to a whole genome, and then reads from
			// only a single contig were selected)
			int ctgIdx = -1;
			int bin = 0;
			LinearIndex linIdx;
			int ctgBins;
			long lastStart = 0;
			do {
				ctgIdx++;
				linIdx = idx.getLinearIndex(ctgIdx);
				ctgBins = linIdx.size();
			} while(ctgBins == 0);
			long nextStart = linIdx.get(bin);

			FileVirtualSplit newSplit = null;
			boolean lastWasGuessed = false;

			// loop and process all of the splits that share a single .bai
			while(splitsEnd < splits.size() &&
					((FileSplit)(splits.get(splitsEnd))).getPath() == path) {
				FileSplit fSplit = (FileSplit)splits.get(splitsEnd);
				splitsEnd++;

				if (splitsEnd >= splits.size()) {
					break;
				}

				long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
				lastStart = nextStart;

				// we need to advance and find the first linear index bin
				// that starts after the current split ends.
				// this is the end of our split.
				while(nextStart < fSplitEnd && ctgIdx < dict.size()) {

					// are we going off of the end of this contig?
					// if so, advance to the next contig with a linear bin
					if (bin + 1 >= ctgBins) {
						do {
							ctgIdx += 1;
							bin = 0;
							if (ctgIdx >= dict.size()) {
								break;
							}
							linIdx = idx.getLinearIndex(ctgIdx);
							ctgBins = linIdx.size();
						} while (ctgBins == 0);
					}
					if (ctgIdx < dict.size() && linIdx.size() > bin) {
						nextStart = linIdx.get(bin);
						bin++;
					}
				}

				// is this the first split?
				// if so, split ranges from where the reads start until the identified end
				if (fSplit.getStart() == 0) {
					final SeekableStream inFile =
						WrapSeekable.openPath(path.getFileSystem(conf), path);
					SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false)
						.open(SamInputResource.of(inFile));
					SAMFileSpan span = open.indexing().getFilePointerSpanningReads();
					long bamStart = ((BAMFileSpan) span).getFirstOffset();
					newSplit = new FileVirtualSplit(fSplit.getPath(),
							bamStart,
							nextStart - 1,
							fSplit.getLocations());
					newSplits.add(newSplit);
				} else {

					// did we find any blocks that started in the last split?
					// if yes, then we're fine
					// if no, then we need to guess a split start (in the else clause)
					if (lastStart != nextStart) {
						if (lastWasGuessed) {
							newSplit.setEndVirtualOffset(lastStart - 1);
							lastWasGuessed = false;
						}
						newSplit = new FileVirtualSplit(fSplit.getPath(),
								lastStart,
								nextStart - 1,
								fSplit.getLocations());
						newSplits.add(newSplit);
					} else {
						// guess the start
						long alignedBeg = guesser.guessNextBAMRecordStart(fSplit.getStart(),
								fSplit.getStart() + fSplit.getLength());
						newSplit.setEndVirtualOffset(alignedBeg - 1);
						lastStart = alignedBeg;
						nextStart = alignedBeg;
						newSplit = new FileVirtualSplit(fSplit.getPath(),
								alignedBeg,
								alignedBeg + 1,
								fSplit.getLocations());
						lastWasGuessed = true;
						newSplits.add(newSplit);
					}
				}
				lastStart = nextStart;
			}
			// clean up the last split
			if (splitsEnd == splits.size()) {
				if (lastWasGuessed) {
					newSplit.setEndVirtualOffset(lastStart - 1);
					lastWasGuessed = false;
				}
				FileSplit fSplit = (FileSplit)splits.get(splitsEnd - 1);
				long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
				newSplit = new FileVirtualSplit(fSplit.getPath(),
						lastStart,
						fSplitEnd,
						fSplit.getLocations());
				newSplits.add(newSplit);
			}
		}
		return splitsEnd + 1;
	}
        
	// Works the same way as addIndexedSplits, to avoid having to reopen the
	// file repeatedly and checking addIndexedSplits for an index repeatedly.
	private int addProbabilisticSplits(
			List<InputSplit> splits, int i, List<InputSplit> newSplits,
			Configuration cfg)
		throws IOException
	{
		final Path path = ((FileSplit)splits.get(i)).getPath();
		final SeekableStream sin =
			WrapSeekable.openPath(path.getFileSystem(cfg), path);

		final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

		FileVirtualSplit previousSplit = null;

		for (; i < splits.size(); ++i) {
			FileSplit fspl = (FileSplit)splits.get(i);
			if (!fspl.getPath().equals(path))
				break;

			long beg =       fspl.getStart();
			long end = beg + fspl.getLength();

			long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

			// As the guesser goes to the next BGZF block before looking for BAM
			// records, the ending BGZF blocks have to always be traversed fully.
			// Hence force the length to be 0xffff, the maximum possible.
			long alignedEnd = end << 16 | 0xffff;
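
			// E.g. (illustrative numbers): a split ending at file byte 4096
			// yields (4096L << 16) | 0xffff == 0x1000ffff — block offset 4096,
			// maximal in-block offset 0xffff.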

			if (alignedBeg == end) {
				// No records detected in this split: merge it to the previous one.
				// This could legitimately happen e.g. if we have a split that is
				// so small that it only contains the middle part of a BGZF block.
				//
				// Of course, if it's the first split, then this is simply not a
				// valid BAM file.
				//
				// FIXME: In theory, any number of splits could only contain parts
				// of the BAM header before we start to see splits that contain BAM
				// records. For now, we require that the split size is at least as
				// big as the header and don't handle that case.
				if (previousSplit == null)
					throw new IOException("'" + path + "': "+
						"no reads in first split: bad BAM file or tiny split size?");

				previousSplit.setEndVirtualOffset(alignedEnd);
			} else {
				previousSplit = new FileVirtualSplit(
					path, alignedBeg, alignedEnd, fspl.getLocations());
				if (logger.isDebugEnabled()) {
					final long byteOffset  = alignedBeg >>> 16;
					final long recordOffset = alignedBeg & 0xffff;
					logger.debug(
						"Split {}: byte offset: {} record offset: {}, virtual offset: {}",
						i, byteOffset, recordOffset, alignedBeg);
				}
				newSplits.add(previousSplit);
			}
		}

		sin.close();
		return i;
	}

	private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf)
			throws IOException {
		if (!isBoundedTraversal(conf)) {
			return splits;
		}

		// Get the chunk lists (BAMFileSpans) in the intervals we want (chunks give start
		// and end file pointers into a BAM file) by looking in all the indexes for the BAM
		// files
		Set<Path> bamFiles = new LinkedHashSet<>();
		for (InputSplit split : splits) {
			bamFiles.add(((FileVirtualSplit) split).getPath());
		}
		Map<Path, BAMFileSpan> fileToSpan = new LinkedHashMap<>();
		SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
				.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true)
				.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
				.setUseAsyncIo(false);

		List<Interval> intervals = getIntervals(conf);

		Map<Path, Long> fileToUnmapped = new LinkedHashMap<>();
		boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf);

		for (Path bamFile : bamFiles) {
			FileSystem fs = bamFile.getFileSystem(conf);

			try (SamReader samReader =
							 readerFactory.open(NIOFileUtil.asPath(fs.makeQualified(bamFile).toUri()))) {
				if (!samReader.hasIndex()) {
					throw new IllegalArgumentException("Intervals set but no BAM index file found for " + bamFile);

				}

				try (FSDataInputStream in = fs.open(bamFile)) {
					SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
					SAMSequenceDictionary dict = header.getSequenceDictionary();
					BAMIndex idx = samReader.indexing().getIndex();

					if (intervals != null && !intervals.isEmpty()) {
						QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict);
						fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx));
					}

					if (traverseUnplacedUnmapped) {
						long startOfLastLinearBin = idx.getStartOfLastLinearBin();
						long noCoordinateCount = ((AbstractBAMFileIndex) idx).getNoCoordinateCount();
						if (startOfLastLinearBin != -1 && noCoordinateCount > 0) {
							// add FileVirtualSplit (with no intervals) from startOfLastLinearBin to
							// end of file
							fileToUnmapped.put(bamFile, startOfLastLinearBin);
						}
					}
				}

			}
		}

		// Use the chunks to filter the splits
		List filteredSplits = new ArrayList<>();
		for (InputSplit split : splits) {
			FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
			long splitStart = virtualSplit.getStartVirtualOffset();
			long splitEnd = virtualSplit.getEndVirtualOffset();
			BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(splitStart, splitEnd));
			BAMFileSpan span = fileToSpan.get(virtualSplit.getPath());
			if (span == null) {
				continue;
			}
			span = (BAMFileSpan) span.removeContentsBefore(splitSpan);
			span = (BAMFileSpan) span.removeContentsAfter(splitSpan);
			if (!span.getChunks().isEmpty()) {
				filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd,
						virtualSplit.getLocations(), span.toCoordinateArray()));
			}
		}

		if (traverseUnplacedUnmapped) {
			// add extra splits that contain only unmapped reads
			for (Map.Entry<Path, Long> e : fileToUnmapped.entrySet()) {
				Path file = e.getKey();
				long unmappedStart = e.getValue();
				boolean foundFirstSplit = false;
				for (InputSplit split : splits) { // TODO: are splits in order of start position?
					FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
					if (virtualSplit.getPath().equals(file)) {
						long splitStart = virtualSplit.getStartVirtualOffset();
						long splitEnd = virtualSplit.getEndVirtualOffset();
						if (foundFirstSplit) {
							filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd,
									virtualSplit.getLocations()));
						} else if (splitStart <= unmappedStart && unmappedStart <= splitEnd) {
							filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), unmappedStart, splitEnd,
									virtualSplit.getLocations()));
							foundFirstSplit = true;
						}
					}
				}
			}
		}

		return filteredSplits;
	}

	/**
	 * Converts a List of {@link Interval}s into the format required by the SamReader query API
	 * @param rawIntervals intervals to be converted
	 * @param sequenceDictionary sequence dictionary used to resolve contig names
	 * @return A sorted, merged list of QueryIntervals suitable for passing to the SamReader query API
	 */
	static QueryInterval[] prepareQueryIntervals( final List<Interval>
			rawIntervals, final SAMSequenceDictionary sequenceDictionary ) {
		if ( rawIntervals == null || rawIntervals.isEmpty() ) {
			return null;
		}

		// Convert each Interval to a QueryInterval
		final QueryInterval[] convertedIntervals =
				rawIntervals.stream()
						.map(rawInterval -> convertSimpleIntervalToQueryInterval(rawInterval, sequenceDictionary))
						.toArray(QueryInterval[]::new);

		// Intervals must be optimized (sorted and merged) in order to use the htsjdk query API
		return QueryInterval.optimizeIntervals(convertedIntervals);
	}
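
	// A sketch of the merge behavior (hypothetical dictionary in which "chr1"
	// has contig index 0): QueryInterval.optimizeIntervals sorts and merges
	// overlapping intervals, so chr1:1-20000 and chr1:15000-30000 collapse
	// into the single QueryInterval(0, 1, 30000).
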
	/**
	 * Converts an htsjdk {@link Interval} into an htsjdk QueryInterval.
	 *
	 * In doing so, a sequence dictionary lookup is performed to convert from contig name to contig index
	 *
	 * @param interval interval to convert
	 * @param sequenceDictionary sequence dictionary used to perform the conversion
	 * @return an equivalent interval in QueryInterval format
	 */
	private static QueryInterval convertSimpleIntervalToQueryInterval(
			final Interval interval, final SAMSequenceDictionary sequenceDictionary) {
		if (interval == null) {
			throw new IllegalArgumentException("interval may not be null");
		}
		if (sequenceDictionary == null) {
			throw new IllegalArgumentException("sequence dictionary may not be null");
		}

		final int contigIndex = sequenceDictionary.getSequenceIndex(interval.getContig());
		if ( contigIndex == -1 ) {
			throw new IllegalArgumentException("Contig " + interval.getContig() + " not present in reads sequence " +
					"dictionary");
		}

		return new QueryInterval(contigIndex, interval.getStart(), interval.getEnd());
	}

	@Override public boolean isSplitable(JobContext job, Path path) {
		return true;
	}
}