org.seqdoop.hadoop_bam.FastaInputFormat

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.

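For context, a minimal driver sketch showing how this input format might be wired
into a MapReduce job is given below. The class name, job name, and path arguments
are illustrative assumptions, not part of Hadoop-BAM:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.seqdoop.hadoop_bam.FastaInputFormat;

// Hypothetical driver: reads a single FASTA file with FastaInputFormat.
public class FastaJobDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "fasta-read");
        job.setJarByClass(FastaJobDriver.class);
        // Keys are Text (description plus position), values are ReferenceFragment.
        job.setInputFormatClass(FastaInputFormat.class);
        job.setNumReduceTasks(0); // map-only; a real job would also set a mapper
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
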
// Copyright (c) 2012 Aalto University
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

package org.seqdoop.hadoop_bam;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads the FASTA reference sequence format.
 * Key: sequence description and position offset, delimited by ':' characters.
 * Value: a ReferenceFragment object representing the entry.
 *
 * Note: sections in the input file are assumed to be delimited by single-line
 * descriptions that start with '>'.
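 *
 * For reference, a minimal FASTA input of the kind this format expects looks
 * like the following (sequence names are illustrative):
 *
 * <pre>
 * &gt;chr1 optional description
 * ACGTACGTACGT
 * ACGTACGT
 * &gt;chr2
 * TTGACCAGGT
 * </pre>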
 */
public class FastaInputFormat extends FileInputFormat<Text, ReferenceFragment>
{
	private static final Logger logger = LoggerFactory.getLogger(FastaInputFormat.class);
	public static final Charset UTF8 = Charset.forName("UTF8");

    @Override public List<InputSplit> getSplits(JobContext job) throws IOException
	{

	    // Note: we generate splits that correspond to different sections in the
	    // FASTA input (here called "chromosomes"), each delimited by '>' and
	    // followed by a single-line description.
	    // Some locality is preserved since the locations are formed from the input
	    // splits, although no special attention is given to this issue (FASTA files
	    // are assumed to be smallish).
	    // The splits are generated on the client. In the future the split generation
	    // should be performed only once and an index file stored inside HDFS for
	    // performance reasons. Currently this is not attempted (again: FASTA files
	    // aren't all that big).

	    // We first make sure we are given only a single file.

            List<InputSplit> splits = super.getSplits(job);
            
            // first sort by input path
            Collections.sort(splits, new Comparator<InputSplit>()
                             {
                                 public int compare(InputSplit a, InputSplit b) {
                                     FileSplit fa = (FileSplit)a, fb = (FileSplit)b;
                                     return fa.getPath().compareTo(fb.getPath());
                                 }
                             });

            for (int i = 0; i < splits.size()-1; i++) {
                FileSplit fa = (FileSplit)splits.get(i);
                FileSplit fb = (FileSplit)splits.get(i+1);
                    
                if(fa.getPath().compareTo(fb.getPath()) != 0)
                    throw new IOException("FastaInputFormat assumes single FASTA input file!");
            }

            // now we are sure we only have one FASTA input file

	    final List<InputSplit> newSplits = new ArrayList<InputSplit>(splits.size());
	    FileSplit fileSplit = (FileSplit)splits.get(0);
	    Path path = fileSplit.getPath();

	    FileSystem fs = path.getFileSystem(job.getConfiguration());
	    FSDataInputStream fis = fs.open(path);
	    byte[] buffer = new byte[1024];

	    long byte_counter = 0;
	    long prev_chromosome_byte_offset = 0;
	    boolean first_chromosome = true;

	    for(int j = 0; j < splits.size(); j++) {
		FileSplit origsplit = (FileSplit)splits.get(j);

		while(byte_counter < origsplit.getStart()+origsplit.getLength()) {
		    long bytes_read = fis.read(byte_counter, buffer, 0, (int)Math.min(buffer.length,
										      origsplit.getStart()+origsplit.getLength()- byte_counter));
		    if (logger.isDebugEnabled()) {
			logger.debug("bytes_read: {} of {} splits", bytes_read, splits.size());
		    }
		    if(bytes_read > 0) {
			for(int i = 0; i < bytes_read; i++) {
			    if(buffer[i] == '>') {
				if (logger.isDebugEnabled()) {
					logger.debug("found chromosome at position {}", byte_counter + i);
				}
				
				if(!first_chromosome) {
				    FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter + i-1 - prev_chromosome_byte_offset, origsplit.getLocations());

					if (logger.isDebugEnabled()) {
						logger.debug("adding split: start: {}, length: {}", fsplit.getStart(), fsplit.getLength());
					}
				    newSplits.add(fsplit);
				}
				first_chromosome = false;
				prev_chromosome_byte_offset = byte_counter + i;
			    }
			}
			byte_counter += bytes_read;
		    }
		}

		if(j == splits.size()-1) {
		    FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter - prev_chromosome_byte_offset, origsplit.getLocations());
		    newSplits.add(fsplit);
			if (logger.isDebugEnabled()) {
				logger.debug("adding split: {}", fsplit);
			}
		    break;
		}
	    }
	    
	    return newSplits;
	}

	public static class FastaRecordReader extends RecordReader<Text, ReferenceFragment>
	{
		
		// start:  first valid data index
		private long start;
		// end:  first index value beyond the slice, i.e. slice is in range [start,end)
		private long end;
		// pos: current position in file
		private long pos;
		// file:  the file being read
		private Path file;

		// current_split_pos: the current (chromosome) position within the split
		private int current_split_pos;
		// current_split_indexseq: the description/chromosome name
		private String current_split_indexseq = null;

		private LineReader lineReader;
		private InputStream inputStream;
		private Text currentKey = new Text();
		private ReferenceFragment currentValue = new ReferenceFragment();

		private Text buffer = new Text();

		// How long can a FASTA line get?
		public static final int MAX_LINE_LENGTH = 20000;

		public FastaRecordReader(Configuration conf, FileSplit split) throws IOException
		{
			setConf(conf);
			file = split.getPath();
			start = split.getStart();
			end = start + split.getLength();
			current_split_pos = 1;

			FileSystem fs = file.getFileSystem(conf);
			FSDataInputStream fileIn = fs.open(file);

			CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
			CompressionCodec        codec        = codecFactory.getCodec(file);

			if (codec == null) // no codec.  Uncompressed file.
			{
				positionAtFirstRecord(fileIn);
				inputStream = fileIn;
			}
			else
			{ // compressed file
				if (start != 0)
					throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

				inputStream = codec.createInputStream(fileIn);
				end = Long.MAX_VALUE; // read until the end of the file
			}

			lineReader = new LineReader(inputStream);
		}

		/*
		 * Position the input stream at the start of the first record.
		 */
		private void positionAtFirstRecord(FSDataInputStream stream) throws IOException
		{
		    if (start > 0)
			{
			    stream.seek(start);
			}

		    // we are now in a new chromosome/fragment, so read its name/index sequence
		    // and reset position counter

		    // index sequence
		    LineReader reader = new LineReader(stream);
		    int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));

		    current_split_indexseq = buffer.toString();
		    // now get rid of '>' character
		    current_split_indexseq = current_split_indexseq.substring(1);
		    
		    // initialize position counter
		    current_split_pos = 1;

			if (logger.isDebugEnabled()) {
				logger.debug("read index sequence: {}", current_split_indexseq);
			}
		    start = start + bytesRead;
		    stream.seek(start);
		    pos = start;
		}

		protected void setConf(Configuration conf)
		{
		}

		/**
		 * Added to use mapreduce API.
		 */
		public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException
		{
		}

		/**
		 * Added to use mapreduce API.
		 */
		public Text getCurrentKey()
		{
			return currentKey;
		}

		/**
		 * Added to use mapreduce API.
		 */
		public ReferenceFragment getCurrentValue()
	 	{
			return currentValue;
		}

		/**
		 * Added to use mapreduce API.
		 */
		public boolean nextKeyValue() throws IOException, InterruptedException
		{
			return next(currentKey, currentValue);
		}

		/**
		 * Close this RecordReader to future operations.
		 */
		public void close() throws IOException
		{
			inputStream.close();
		}

		/**
		 * Create an object of the appropriate type to be used as a key.
		 */
		public Text createKey()
		{
			return new Text();
		}

		/**
		 * Create an object of the appropriate type to be used as a value.
		 */
		public ReferenceFragment createValue()
		{
			return new ReferenceFragment();
		}

		/**
		 * Returns the current position in the input.
		 */
		public long getPos() { return pos; }

		/**
		 * How much of the input the RecordReader has consumed,
		 * i.e. the fraction of the slice that has been processed.
		 */
		public float getProgress()
		{
			if (start == end)
				return 1.0f;
			else
				return Math.min(1.0f, (pos - start) / (float)(end - start));
		}

		public String makePositionMessage(long pos)
		{
			return file.toString() + ":" + pos;
		}

		public String makePositionMessage()
		{
			return file.toString() + ":" + pos;
		}

		/**
		 * Reads the next key/value pair from the input for processing.
		 */
		public boolean next(Text key, ReferenceFragment value) throws IOException
		{
			if (pos >= end)
				return false; // past end of slice

			int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH);
			pos += bytesRead;
			if (bytesRead >= MAX_LINE_LENGTH)
				throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + makePositionMessage(pos - bytesRead) + ": " + Text.decode(buffer.getBytes(), 0, 500));
			else if (bytesRead <= 0)
				return false; // EOF
			else
			{
				scanFastaLine(buffer, key, value);
				current_split_pos += bytesRead;
				return true;
			}
		}

		private void scanFastaLine(Text line, Text key, ReferenceFragment fragment)
		{
		    // Build the key.  We concatenate the chromosome/fragment description and
		    // the start position of the FASTA sequence line, replacing tabs with colons.
		    key.clear();
		    
		    key.append(current_split_indexseq.getBytes(UTF8), 0, current_split_indexseq.getBytes(UTF8).length);
		    key.append(Integer.toString(current_split_pos).getBytes(UTF8), 0, Integer.toString(current_split_pos).getBytes(UTF8).length);
		    // replace tabs with :
		    byte[] bytes = key.getBytes();
		    int temporaryEnd = key.getLength();
		    for (int i = 0; i < temporaryEnd; ++i)
			if (bytes[i] == '\t')
			    bytes[i] = ':';
		    
		    fragment.clear();
		    fragment.setPosition(current_split_pos);
		    fragment.setIndexSequence(current_split_indexseq);
		    fragment.getSequence().append(line.getBytes(), 0, line.getBytes().length);
		}
	}

	@Override
	public boolean isSplitable(JobContext context, Path path)
	{
		CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path);
		return codec == null;
	}

	public RecordReader<Text, ReferenceFragment> createRecordReader(
	                                        InputSplit genericSplit,
	                                        TaskAttemptContext context) throws IOException, InterruptedException
	{
		context.setStatus(genericSplit.toString());
		return new FastaRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat
	}
}
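
For illustration, a mapper consuming this input format might look like the sketch
below. FastaBaseCounter and the per-line length output are hypothetical; only the
Text/ReferenceFragment key-value types come from the class above:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.seqdoop.hadoop_bam.ReferenceFragment;

// Hypothetical mapper: emits each record's key and the length of the
// reference-sequence line it carries.
public class FastaBaseCounter extends Mapper<Text, ReferenceFragment, Text, LongWritable> {
    @Override
    protected void map(Text key, ReferenceFragment fragment, Context context)
            throws IOException, InterruptedException {
        // key is the entry description plus position (tabs replaced by ':');
        // fragment.getSequence() holds one line of reference bases as Text.
        context.write(key, new LongWritable(fragment.getSequence().getLength()));
    }
}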



