
org.seqdoop.hadoop_bam.CRAMInputFormat Maven / Gradle / Ivy

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
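
To pull the library into a build, coordinates along the following lines should work (a sketch based on this page's metadata; 7.10.0 is the newest version noted here, so substitute the version you actually need):

    <!-- Maven -->
    <dependency>
      <groupId>org.seqdoop</groupId>
      <artifactId>hadoop-bam</artifactId>
      <version>7.10.0</version>
    </dependency>

    // Gradle
    implementation 'org.seqdoop:hadoop-bam:7.10.0'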

package org.seqdoop.hadoop_bam;

import htsjdk.samtools.cram.build.CramContainerIterator;
import htsjdk.samtools.seekablestream.SeekableStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.seqdoop.hadoop_bam.util.WrapSeekable;

public class CRAMInputFormat extends FileInputFormat<LongWritable, SAMRecordWritable> {

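  /**
   * Names the path of the reference source (e.g. a reference FASTA) used when
   * decoding CRAM records, which are stored compressed against a reference.
   */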
  public static final String REFERENCE_SOURCE_PATH_PROPERTY =
      "hadoopbam.cram.reference-source-path";

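  /**
   * Computes the default file splits, then shifts each one to the next CRAM
   * container boundary so that no split begins in the middle of a container.
   */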
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    return getSplits(super.getSplits(job), job.getConfiguration());
  }

  public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf)
      throws IOException {
    // update splits to align with CRAM container boundaries
    List<InputSplit> newSplits = new ArrayList<InputSplit>();
    Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>();
    for (InputSplit split : splits) {
      FileSplit fileSplit = (FileSplit) split;
      Path path = fileSplit.getPath();
      List<Long> containerOffsets = fileToOffsets.get(path);
      if (containerOffsets == null) {
        containerOffsets = getContainerOffsets(conf, path);
        fileToOffsets.put(path, containerOffsets);
      }
      long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart());
      long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() +
          fileSplit.getLength());
      long newLength = newEnd - newStart;
      if (newLength == 0) { // split is wholly within a container
        continue;
      }
      FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength,
          fileSplit.getLocations());
      newSplits.add(newSplit);
    }
    return newSplits;
  }

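  /**
   * Scans the CRAM file once, recording the byte offset at which each container
   * starts; the file length is appended as a final sentinel so that every
   * position in the file has a next boundary.
   */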
  private static List<Long> getContainerOffsets(Configuration conf, Path cramFile)
      throws IOException {
    SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile);
    CramContainerIterator cci = new CramContainerIterator(seekableStream);
    List<Long> containerOffsets = new ArrayList<Long>();
    containerOffsets.add(seekableStream.position());
    while (cci.hasNext()) {
      cci.next();
      containerOffsets.add(seekableStream.position());
    }
    containerOffsets.add(seekableStream.length());
    return containerOffsets;
  }

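  /**
   * Returns the first container boundary at or after {@code position}; the file
   * length sentinel guarantees a match for any position within the file.
   */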
  private static long nextContainerOffset(List<Long> containerOffsets, long position) {
    for (long offset : containerOffsets) {
      if (offset >= position) {
        return offset;
      }
    }
    throw new IllegalStateException("Could not find position " + position + " in " +
        "container offsets: " + containerOffsets);
  }

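  /**
   * Creates and initializes a {@link CRAMRecordReader}, which emits each record
   * in the split as a {@link SAMRecordWritable} keyed by a {@link LongWritable}
   * position.
   */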
  @Override
  public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<LongWritable, SAMRecordWritable> rr = new CRAMRecordReader();
    rr.initialize(split, context);
    return rr;
  }

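  /** CRAM inputs are splittable here because getSplits realigns every split to a container boundary. */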
  @Override
  public boolean isSplitable(JobContext job, Path path) {
    return true;
  }
}
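
A minimal sketch of wiring this input format into a MapReduce job. The class name, reference path, and input path below are hypothetical placeholders; the reference must be the one the CRAM reads were compressed against.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.seqdoop.hadoop_bam.CRAMInputFormat;

    public class CramJobSetup {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical reference location; CRAM decoding requires the reference
        conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY,
            "file:///data/ref/human_g1k_v37.fasta");

        Job job = Job.getInstance(conf, "cram-example");
        job.setJarByClass(CramJobSetup.class);
        // keys: LongWritable positions, values: SAMRecordWritable records
        job.setInputFormatClass(CRAMInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/data/sample.cram")); // hypothetical input
        // configure mapper, reducer, and output format as usual, then:
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }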