org.seqdoop.hadoop_bam.FastaInputFormat
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
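Before the source itself, a minimal sketch of how this input format might be wired into a MapReduce job. The driver below is illustrative only: MyFastaMapper and the input/output paths are hypothetical placeholders, while setInputFormatClass and addInputPath are the standard org.apache.hadoop.mapreduce API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.seqdoop.hadoop_bam.FastaInputFormat;

public class FastaJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "fasta-example");
        job.setJarByClass(FastaJobDriver.class);
        // FastaInputFormat emits Text keys ("description:position") and
        // ReferenceFragment values, one record per FASTA sequence line.
        job.setInputFormatClass(FastaInputFormat.class);
        job.setMapperClass(MyFastaMapper.class); // hypothetical mapper class
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}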
// Copyright (c) 2012 Aalto University
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package org.seqdoop.hadoop_bam;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Reads the FASTA reference sequence format.
* Key: sequence description and position offset, delimited by ':' characters.
* Value: a ReferenceFragment object representing the entry.
*
* Note: here sections in the input file are assumed to be delimited by
* single-line descriptions that start with '>'.
*/
public class FastaInputFormat extends FileInputFormat<Text,ReferenceFragment>
{
private static final Logger logger = LoggerFactory.getLogger(FastaInputFormat.class);
public static final Charset UTF8 = Charset.forName("UTF8");
@Override public List<InputSplit> getSplits(JobContext job) throws IOException
{
// Note: we generate splits that correspond to the different sections in the
// FASTA input (which here are called "chromosomes", delimited by '>' and
// followed by a single-line description).
// Some locality is preserved since the locations are formed from the input
// splits, although no special attention is given to this issue (FASTA files
// are assumed to be smallish).
// The splits are generated on the client. In the future the split generation
// should be performed only once and an index file stored inside HDFS for
// performance reasons. Currently this is not attempted (again: FASTA files
// aren't all that big).
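// For illustration, a hypothetical input file
//   >chr1 first sequence
//   ACGT...
//   >chr2 second sequence
//   TTGA...
// yields exactly two splits, one per '>' record, independent of where the
// underlying HDFS block boundaries happen to fall.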
// we first make sure we are given only a single file
List<InputSplit> splits = super.getSplits(job);
// first sort by input path
Collections.sort(splits, new Comparator<InputSplit>()
{
public int compare(InputSplit a, InputSplit b) {
FileSplit fa = (FileSplit)a, fb = (FileSplit)b;
return fa.getPath().compareTo(fb.getPath());
}
});
for (int i = 0; i < splits.size()-1; i++) {
FileSplit fa = (FileSplit)splits.get(i);
FileSplit fb = (FileSplit)splits.get(i+1);
if(fa.getPath().compareTo(fb.getPath()) != 0)
throw new IOException("FastaInputFormat assumes single FASTA input file!");
}
// now we are sure we only have one FASTA input file
final List<InputSplit> newSplits = new ArrayList<InputSplit>(splits.size());
FileSplit fileSplit = (FileSplit)splits.get(0);
Path path = fileSplit.getPath();
FileSystem fs = path.getFileSystem(job.getConfiguration());
FSDataInputStream fis = fs.open(path);
byte[] buffer = new byte[1024];
long byte_counter = 0;
long prev_chromosome_byte_offset = 0;
boolean first_chromosome = true;
for(int j = 0; j < splits.size(); j++) {
FileSplit origsplit = (FileSplit)splits.get(j);
while(byte_counter < origsplit.getStart()+origsplit.getLength()) {
long bytes_read = fis.read(byte_counter, buffer, 0, (int)Math.min(buffer.length,
origsplit.getStart()+origsplit.getLength()- byte_counter));
if (logger.isDebugEnabled()) {
logger.debug("bytes_read: {} of {} splits", bytes_read, splits.size());
}
if(bytes_read > 0) {
for(int i = 0; i < bytes_read; i++) {
if(buffer[i] == '>') {
if (logger.isDebugEnabled()) {
logger.debug("found chromosome at position {}", byte_counter + i);
}
if(!first_chromosome) {
FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter + i-1 - prev_chromosome_byte_offset, origsplit.getLocations());
if (logger.isDebugEnabled()) {
logger.debug("adding split: start: {}, length: {}", fsplit.getStart(), fsplit.getLength());
}
newSplits.add(fsplit);
}
first_chromosome = false;
prev_chromosome_byte_offset = byte_counter + i;
}
}
byte_counter += bytes_read;
}
}
if(j == splits.size()-1) {
FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter - prev_chromosome_byte_offset, origsplit.getLocations());
newSplits.add(fsplit);
if (logger.isDebugEnabled()) {
logger.debug("adding split: {}", fsplit);
}
break;
}
}
fis.close();
return newSplits;
}
public static class FastaRecordReader extends RecordReader<Text,ReferenceFragment>
{
// start: first valid data index
private long start;
// end: first index value beyond the slice, i.e. slice is in range [start,end)
private long end;
// pos: current position in file
private long pos;
// file: the file being read
private Path file;
// current_split_pos: the current (chromosome) position within the split
private int current_split_pos;
// current_split_indexseq: the description/chromosome name
private String current_split_indexseq = null;
private LineReader lineReader;
private InputStream inputStream;
private Text currentKey = new Text();
private ReferenceFragment currentValue = new ReferenceFragment();
private Text buffer = new Text();
// How long can a FASTA line get?
public static final int MAX_LINE_LENGTH = 20000;
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException
{
setConf(conf);
file = split.getPath();
start = split.getStart();
end = start + split.getLength();
current_split_pos = 1;
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream fileIn = fs.open(file);
CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
CompressionCodec codec = codecFactory.getCodec(file);
if (codec == null) // no codec. Uncompressed file.
{
positionAtFirstRecord(fileIn);
inputStream = fileIn;
}
else
{ // compressed file
if (start != 0)
throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
inputStream = codec.createInputStream(fileIn);
end = Long.MAX_VALUE; // read until the end of the file
}
lineReader = new LineReader(inputStream);
}
/*
* Position the input stream at the start of the first record.
*/
private void positionAtFirstRecord(FSDataInputStream stream) throws IOException
{
if (start > 0)
{
stream.seek(start);
}
// we are now in a new chromosome/fragment, so read its name/index sequence
// and reset position counter
// index sequence
LineReader reader = new LineReader(stream);
int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
current_split_indexseq = buffer.toString();
// now get rid of '>' character
current_split_indexseq = current_split_indexseq.substring(1,current_split_indexseq.length());
// initialize position counter
current_split_pos = 1;
if (logger.isDebugEnabled()) {
logger.debug("read index sequence: {}", current_split_indexseq);
}
start = start + bytesRead;
stream.seek(start);
pos = start;
}
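// Example (hypothetical data): if this split begins at the '>' of a line
// ">chr2 some description", then after positionAtFirstRecord() returns,
// current_split_indexseq is "chr2 some description", pos points at the
// first byte of sequence data, and current_split_pos is 1.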
protected void setConf(Configuration conf)
{
}
/**
* Added to use mapreduce API.
*/
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException
{
}
/**
* Added to use mapreduce API.
*/
public Text getCurrentKey()
{
return currentKey;
}
/**
* Added to use mapreduce API.
*/
public ReferenceFragment getCurrentValue()
{
return currentValue;
}
/**
* Added to use mapreduce API.
*/
public boolean nextKeyValue() throws IOException, InterruptedException
{
return next(currentKey, currentValue);
}
/**
* Close this RecordReader to future operations.
*/
public void close() throws IOException
{
inputStream.close();
}
/**
* Create an object of the appropriate type to be used as a key.
*/
public Text createKey()
{
return new Text();
}
/**
* Create an object of the appropriate type to be used as a value.
*/
public ReferenceFragment createValue()
{
return new ReferenceFragment();
}
/**
* Returns the current position in the input.
*/
public long getPos() { return pos; }
/**
* How much of the input has the RecordReader consumed, as a fraction
* between 0 and 1.
*/
public float getProgress()
{
if (start == end)
return 1.0f;
else
return Math.min(1.0f, (pos - start) / (float)(end - start));
}
public String makePositionMessage(long pos)
{
return file.toString() + ":" + pos;
}
public String makePositionMessage()
{
return file.toString() + ":" + pos;
}
/**
* Reads the next key/value pair from the input for processing.
*/
public boolean next(Text key, ReferenceFragment value) throws IOException
{
if (pos >= end)
return false; // past end of slice
int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH);
pos += bytesRead;
if (bytesRead >= MAX_LINE_LENGTH)
throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + makePositionMessage(pos - bytesRead) + ": " + Text.decode(buffer.getBytes(), 0, 500));
else if (bytesRead <= 0)
return false; // EOF
else
{
scanFastaLine(buffer, key, value);
current_split_pos += bytesRead;
return true;
}
}
private void scanFastaLine(Text line, Text key, ReferenceFragment fragment)
{
// Build the key. We concatenate the chromosome/fragment description and
// the start position of the FASTA sequence line, replacing tabs with colons.
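// Example (hypothetical data): the first sequence line of a record
// ">chr1" gets the key "chr1:1" (position counting starts at 1).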
key.clear();
key.append(current_split_indexseq.getBytes(UTF8), 0, current_split_indexseq.getBytes(UTF8).length);
// insert the ':' delimiter between description and position, as promised
// by the class javadoc
key.append(":".getBytes(UTF8), 0, 1);
key.append(Integer.toString(current_split_pos).getBytes(UTF8), 0, Integer.toString(current_split_pos).getBytes(UTF8).length);
// replace tabs with :
byte[] bytes = key.getBytes();
int temporaryEnd = key.getLength();
for (int i = 0; i < temporaryEnd; ++i)
if (bytes[i] == '\t')
bytes[i] = ':';
fragment.clear();
fragment.setPosition(current_split_pos);
fragment.setIndexSequence(current_split_indexseq);
fragment.getSequence().append(line.getBytes(), 0, line.getLength());
}
}
@Override
public boolean isSplitable(JobContext context, Path path)
{
CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path);
return codec == null;
}
public RecordReader<Text,ReferenceFragment> createRecordReader(
InputSplit genericSplit,
TaskAttemptContext context) throws IOException, InterruptedException
{
context.setStatus(genericSplit.toString());
return new FastaRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat
}
}
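To make the key/value contract concrete, here is a minimal mapper sketch consuming the records this format produces. The class name and counting logic are illustrative, not part of Hadoop-BAM; it assumes ReferenceFragment.getSequence() returns a Text holding one FASTA line, as suggested by scanFastaLine above.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.seqdoop.hadoop_bam.ReferenceFragment;

public class BaseCountMapper extends Mapper<Text, ReferenceFragment, Text, LongWritable> {
    private final Text base = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(Text key, ReferenceFragment fragment, Context context)
            throws IOException, InterruptedException {
        // key is "description:position"; the value holds one sequence line.
        String seq = fragment.getSequence().toString();
        for (int i = 0; i < seq.length(); i++) {
            base.set(String.valueOf(seq.charAt(i)));
            context.write(base, one); // emit (nucleotide, 1) for a count reducer
        }
    }
}

Paired with the stock org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer, this would give per-base counts over the reference.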