org.seqdoop.hadoop_bam.QseqInputFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
There is a newer version: 7.10.0
// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

package org.seqdoop.hadoop_bam;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
import org.seqdoop.hadoop_bam.util.ConfHelper;

/**
 * Reads the Illumina qseq sequence format.
 * Key: instrument, run number, lane, tile, xpos, ypos, read number, delimited by ':' characters.
 * Value:  a SequencedFragment object representing the entry.
 */
public class QseqInputFormat extends FileInputFormat
{
	public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-input.base-quality-encoding";
	public static final String CONF_FILTER_FAILED_QC      = "hbam.qseq-input.filter-failed-qc";
	public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina";

	public static class QseqRecordReader extends RecordReader
	{
		/*
		 * qseq format:
		 * 11 tab-separated columns
		 *
		 * 1) Instrument
		 * 2) Run id
		 * 3) Lane number
		 * 4) Tile number
		 * 5) X pos
		 * 6) Y pos
		 * 7) Index sequence (0 for runs without multiplexing)
		 * 8) Read Number
		 * 9) Base Sequence
		 * 10) Base Quality
		 * 11) Filter: did the read pass filtering? 0 - No, 1 - Yes.
		 */
		// start:  first valid data index
		private long start;
		// end:  first index value beyond the slice, i.e. slice is in range [start,end)
		private long end;
		// pos: current position in file
		private long pos;
		// file:  the file being read
		private Path file;

		private LineReader lineReader;
		private InputStream inputStream;
		private Text currentKey = new Text();
		private SequencedFragment currentValue = new SequencedFragment();

		private Text buffer = new Text();
		private static final int NUM_QSEQ_COLS = 11;
		// for these, we have one per qseq field
		private int[] fieldPositions = new int[NUM_QSEQ_COLS];
		private int[] fieldLengths = new int[NUM_QSEQ_COLS];

		private BaseQualityEncoding qualityEncoding;
		private boolean filterFailedQC = false;

		private static final String Delim = "\t";

		// How long can a qseq line get?
		public static final int MAX_LINE_LENGTH = 20000;

		public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
		{
			setConf(conf);
			file = split.getPath();
			start = split.getStart();
			end = start + split.getLength();

			FileSystem fs = file.getFileSystem(conf);
			FSDataInputStream fileIn = fs.open(file);

			CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
			CompressionCodec        codec        = codecFactory.getCodec(file);

			if (codec == null) // no codec.  Uncompressed file.
			{
				positionAtFirstRecord(fileIn);
				inputStream = fileIn;
			}
			else
			{ // compressed file
				if (start != 0)
					throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

				inputStream = codec.createInputStream(fileIn);
				end = Long.MAX_VALUE; // read until the end of the file
			}

			lineReader = new LineReader(inputStream);
		}

		/*
		 * Position the input stream at the start of the first record.
		 */
		private void positionAtFirstRecord(FSDataInputStream stream) throws IOException
		{
			if (start > 0)
			{
				// Advance to the start of the first line in our slice.
				// We use a temporary LineReader to read a partial line and find the
				// start of the first one on or after our starting position.
				// In case our slice starts right at the beginning of a line, we need to back
				// up by one position and then discard the first line.
				start -= 1;
				stream.seek(start);
				LineReader reader = new LineReader(stream);
				int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
				start = start + bytesRead;
				stream.seek(start);
			}
			// else
			//	if start == 0 we're starting at the beginning of a line
			pos = start;
		}

		protected void setConf(Configuration conf)
		{
			String encoding =
			  conf.get(QseqInputFormat.CONF_BASE_QUALITY_ENCODING,
			    conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING,
			      CONF_BASE_QUALITY_ENCODING_DEFAULT));

			if ("illumina".equals(encoding))
				qualityEncoding = BaseQualityEncoding.Illumina;
			else if ("sanger".equals(encoding))
				qualityEncoding = BaseQualityEncoding.Sanger;
			else
				throw new RuntimeException("Unknown input base quality encoding value " + encoding);

			filterFailedQC = ConfHelper.parseBoolean(
			  conf.get(QseqInputFormat.CONF_FILTER_FAILED_QC,
			    conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)),
			      false);
		}

		/**
		 * Added to use mapreduce API.
		 */
		public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException
		{
		}

		/**
		 * Added to use mapreduce API.
		 */
		public Text getCurrentKey()
		{
			return currentKey;
		}

		/**
		 * Added to use mapreduce API.
		 */
		public SequencedFragment getCurrentValue()
	 	{
			return currentValue;
		}

		/**
		 * Added to use mapreduce API.
		 */
		public boolean nextKeyValue() throws IOException, InterruptedException
		{
			return next(currentKey, currentValue);
		}

		/**
		 * Close this RecordReader to future operations.
		 */
		public void close() throws IOException
		{
			inputStream.close();
		}

		/**
		 * Create an object of the appropriate type to be used as a key.
		 */
		public Text createKey()
		{
			return new Text();
		}

		/**
		 * Create an object of the appropriate type to be used as a value.
		 */
		public SequencedFragment createValue()
		{
			return new SequencedFragment();
		}

		/**
		 * Returns the current position in the input.
		 */
		public long getPos() { return pos; }

		/**
		 * How much of the input has the RecordReader consumed i.e.
		 */
		public float getProgress()
		{
			if (start == end)
				return 1.0f;
			else
				return Math.min(1.0f, (pos - start) / (float)(end - start));
		}

		public String makePositionMessage(long pos)
		{
			return file.toString() + ":" + pos;
		}

		public String makePositionMessage()
		{
			return file.toString() + ":" + pos;
		}

		/*
		 * Read a single record.
		 *
		 * Reads a single line of input and scans it with scanQseqLine, which
		 * sets key and value accordingly.  The method updates this.pos.
		 *
		 * @return The number of bytes read.  If no bytes were read, the EOF was reached.
		 */
		private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException
		{
			int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH);
			pos += bytesRead;
			if (bytesRead >= MAX_LINE_LENGTH)
			{
				String line;
				try {
					line = Text.decode(buffer.getBytes(), 0, 500);
				} catch (java.nio.charset.CharacterCodingException e) {
					line = "(line not convertible to printable format)";
				}
				throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " +
				            makePositionMessage(pos - bytesRead) + ": " + line);
			}
			else if (bytesRead > 0)
				scanQseqLine(buffer, key, value);

			return bytesRead;
		}

		/**
		 * Reads the next key/value pair from the input for processing.
		 */
		public boolean next(Text key, SequencedFragment value) throws IOException
		{
			if (pos >= end)
				return false; // past end of slice

			int bytesRead = 0;
			boolean goodRecord;
			do {
				bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached
				goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed());
			} while (bytesRead > 0 && !goodRecord);

			if (goodRecord) // post process the record only if it's going to be used
			{
				try {
					postProcessSequencedFragment(value);
				} catch (FormatException e) {
					throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) +
					            "; line: " + buffer); // last line read is still in the buffer
				}
			}

			return goodRecord;
		}

		/*
		 * Scans the text line to find the position and the lengths of the fields
		 * within it. The positions and lengths are saved into the instance arrays
		 * 'fieldPositions' and 'fieldLengths'.
		 *
		 * @exception FormatException Line doesn't have the expected number of fields.
		 */
		private void setFieldPositionsAndLengths(Text line)
		{
			int pos = 0; // the byte position within the record
			int fieldno = 0; // the field index within the record
			while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field
			{
				int endpos = line.find(Delim, pos); // the field's end position
				if (endpos < 0)
					endpos = line.getLength();

				fieldPositions[fieldno] = pos;
				fieldLengths[fieldno] = endpos - pos;

				pos = endpos + 1; // the next starting position is the current end + 1
				fieldno += 1;
			}

			if (fieldno != NUM_QSEQ_COLS)
				throw new FormatException("found " + fieldno + " fields instead of 11 at " +
				            makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
		}

		private void scanQseqLine(Text line, Text key, SequencedFragment fragment)
		{
			setFieldPositionsAndLengths(line);

			// Build the key.  We concatenate all fields from 0 to 5 (machine to y-pos)
			// and then the read number, replacing the tabs with colons.
			key.clear();
			// append up and including field[5]
			key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]);
			// replace tabs with :
			byte[] bytes = key.getBytes();
			int temporaryEnd = key.getLength();
			for (int i = 0; i < temporaryEnd; ++i)
				if (bytes[i] == '\t')
					bytes[i] = ':';
			// append the read number
			key.append(line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab.
			// convert the tab preceding the read number into a :
			key.getBytes()[temporaryEnd] = ':';

			// now the fragment
			try
			{
				fragment.clear();
				fragment.setInstrument( Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0]) );
				fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1])) );
				//fragment.setFlowcellId();
				fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2])) );
				fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3])) );
				fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4])) );
				fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5])) );
				fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7])) );
				fragment.setFilterPassed( line.getBytes()[fieldPositions[10]] != '0' );
				//fragment.setControlNumber();
				if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence
					fragment.setIndexSequence(null);
				else
					fragment.setIndexSequence(Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N'));
			}
			catch (CharacterCodingException e) {
				throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
			}

			fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]);
			fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]);
		}

		/*
		 * This method applies some transformations to the read and quality data.
		 *
		 * 
		 *   '.' in the read are converted to 'N'
		 *   the base quality encoding is converted to 'sanger', unless otherwise
		 *   requested by the configuration.
		 * 
		 *
		 * @exception FormatException Thrown if the record contains base quality scores
		 * outside the range allowed by the format.
		 */
		private void postProcessSequencedFragment(SequencedFragment fragment)
		{
			byte[] bytes = fragment.getSequence().getBytes();
			// replace . with N
			for (int i = 0; i < fieldLengths[8]; ++i)
				if (bytes[i] == '.')
					bytes[i] = 'N';

			if (qualityEncoding == BaseQualityEncoding.Illumina)
			{
				// convert illumina to sanger scale
				SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger);
			}
			else // sanger qualities.
			{
				int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger);
				if (outOfRangeElement >= 0)
				{
					throw new FormatException("qseq base quality score out of range for Sanger Phred+33 format (found " +
					    (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" +
					    "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n");
				}
			}
		}
	}

	@Override
	public boolean isSplitable(JobContext context, Path path)
	{
		CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path);
		return codec == null;
	}

	public RecordReader createRecordReader(
	                                        InputSplit genericSplit,
	                                        TaskAttemptContext context) throws IOException, InterruptedException
	{
		context.setStatus(genericSplit.toString());
		return new QseqRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat
	}
}