org.seqdoop.hadoop_bam.FastqInputFormat — from the hadoop-bam artifact (Maven / Gradle / Ivy).
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package org.seqdoop.hadoop_bam;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.InputStream;
import java.io.IOException;
import java.io.EOFException;
import java.util.regex.*;
import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
import org.seqdoop.hadoop_bam.util.ConfHelper;
/**
 * Hadoop InputFormat for FASTQ files.  Each record is one read: the key is the
 * read id (the text following '@') and the value is a {@link SequencedFragment}
 * holding the sequence, the qualities, and any metadata parsed from the id.
 */
public class FastqInputFormat extends FileInputFormat<Text, SequencedFragment>
{
    /** Configuration key selecting the input base quality encoding: "sanger" or "illumina". */
    public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding";
    /** Configuration key: if true, skip reads whose Illumina filter flag marks them as failing QC. */
    public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc";
    /** Default base quality encoding when none is configured. */
    public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger";
/**
 * RecordReader that parses FASTQ records out of a FileSplit.  Keys are read
 * ids; values are the corresponding parsed reads.
 */
public static class FastqRecordReader extends RecordReader<Text, SequencedFragment>
{
    /*
     * fastq format:
     * <fastq>    :=  <block>+
     * <block>    :=  @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n
     * <seqname>  :=  [A-Za-z0-9_.:-]+
     * <seq>      :=  [A-Za-z\n\.~]+
     * <qual>     :=  [!-~\n]+
     *
     * LP: this format is broken, no? You can have multi-line sequence and quality strings,
     * and the quality encoding includes '@' in its valid character range. So how should one
     * distinguish between \n@ as a record delimiter and and \n@ as part of a multi-line
     * quality string?
     *
     * For now I'm going to assume single-line sequences. This works for our sequencing
     * application. We'll see if someone complains in other applications.
     */
    // start: first valid data index
    private long start;
    // end: first index value beyond the slice, i.e. slice is in range [start,end)
    private long end;
    // pos: current position in file
    private long pos;
    // file: the file being read
    private Path file;
    private LineReader lineReader;
    private InputStream inputStream;
    private Text currentKey = new Text();
    private SequencedFragment currentValue = new SequencedFragment();
    /* If true, will scan the identifier for read data as specified in the Casava
     * users' guide v1.8:
     * @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
     * After the first name that doesn't match, lookForIlluminaIdentifier will be
     * set to false and no further scanning will be done.
     */
    private boolean lookForIlluminaIdentifier = true;
    private static final Pattern ILLUMINA_PATTERN = Pattern.compile("([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)");
    // scratch buffer reused for lines we only inspect (separator line, boundary scan)
    private Text buffer = new Text();
    // configured encoding of the input base qualities (Sanger or Illumina)
    private BaseQualityEncoding qualityEncoding;
    // if true, reads whose filter flag says they failed QC are skipped by next()
    private boolean filterFailedQC = false;
    // How long can a read get?
    private static final int MAX_LINE_LENGTH = 10000;
/**
 * Creates a reader for the portion of a FASTQ file covered by split.
 *
 * For uncompressed files the reader seeks forward from the split start to
 * the first whole record boundary.  Compressed streams cannot be seeked,
 * so the split must start at 0 and the whole file is read.
 *
 * @param conf job configuration; see setConf for the keys consulted.
 * @param split region of the file assigned to this reader.
 * @throws IOException if the file cannot be opened or read.
 * @throws RuntimeException if a compressed split does not start at offset 0.
 */
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException
{
setConf(conf);
file = split.getPath();
start = split.getStart();
end = start + split.getLength();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream fileIn = fs.open(file);
CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
CompressionCodec codec = codecFactory.getCodec(file);
if (codec == null) // no codec. Uncompressed file.
{
// advance to the first record boundary at or after `start`
positionAtFirstRecord(fileIn);
inputStream = fileIn;
}
else
{ // compressed file
if (start != 0)
throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
inputStream = codec.createInputStream(fileIn);
end = Long.MAX_VALUE; // read until the end of the file
}
lineReader = new LineReader(inputStream);
}
/**
 * Reads this reader's settings from the given configuration.
 *
 * The per-format key (hbam.fastq-input.*) wins over the generic input key
 * (from FormatConstants), which in turn wins over the built-in default.
 *
 * @throws RuntimeException if the configured encoding is neither "sanger"
 *         nor "illumina".
 */
protected void setConf(Configuration conf)
{
    final String fallbackEncoding =
        conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING,
            FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT);
    final String encoding =
        conf.get(FastqInputFormat.CONF_BASE_QUALITY_ENCODING, fallbackEncoding);

    if ("sanger".equals(encoding))
        qualityEncoding = BaseQualityEncoding.Sanger;
    else if ("illumina".equals(encoding))
        qualityEncoding = BaseQualityEncoding.Illumina;
    else
        throw new RuntimeException("Unknown input base quality encoding value " + encoding);

    final String filterSetting =
        conf.get(FastqInputFormat.CONF_FILTER_FAILED_QC,
            conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC));
    filterFailedQC = ConfHelper.parseBoolean(filterSetting, false);
}
/*
 * Position the input stream at the start of the first record.
 *
 * A split rarely begins exactly on a record boundary, so we scan forward
 * from `start` looking for a line that begins with '@' and whose
 * line-after-next begins with '+' (the FASTQ separator).  The '+' check is
 * needed because quality strings may also start with '@'.  Leaves `start`
 * and `pos` at the chosen boundary and seeks the stream there.
 */
private void positionAtFirstRecord(FSDataInputStream stream) throws IOException
{
if (start > 0)
{
// Advance to the start of the first record
// We use a temporary LineReader to read lines until we find the
// position of the right one. We then seek the file to that position.
stream.seek(start);
LineReader reader = new LineReader(stream);
int bytesRead = 0;
do
{
bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@'))
start += bytesRead;
else
{
// line starts with @. Read two more and verify that the second one starts with a +
//
// If this isn't the start of a record, we want to backtrack to its end
long backtrackPosition = start + bytesRead;
bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start));
if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+')
break; // all good!
else
{
// backtrack to the end of the record we thought was the start.
start = backtrackPosition;
stream.seek(start);
// LineReader buffers internally, so build a fresh one after the seek
reader = new LineReader(stream);
}
}
} while (bytesRead > 0);
stream.seek(start);
}
// else
// if start == 0 we presume it starts with a valid fastq record
pos = start;
}
/**
 * Part of the mapreduce RecordReader API.  All initialization is done in
 * the constructor, so this is a no-op.
 */
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException
{
}
/**
 * Returns the key (read id) set by the last successful nextKeyValue call.
 * The same Text object is reused across records.
 */
public Text getCurrentKey()
{
return currentKey;
}
/**
 * Returns the read set by the last successful nextKeyValue call.
 * The same SequencedFragment object is reused across records.
 */
public SequencedFragment getCurrentValue()
{
return currentValue;
}
/**
 * Advances to the next record, filling currentKey and currentValue.
 *
 * @return true if a record was read; false at the end of the slice/file.
 */
public boolean nextKeyValue() throws IOException, InterruptedException
{
return next(currentKey, currentValue);
}
/**
 * Close this RecordReader to future operations.
 * Closes the underlying input stream (the lineReader wraps the same stream).
 */
public void close() throws IOException
{
inputStream.close();
}
/**
 * Create an object of the appropriate type to be used as a key.
 */
public Text createKey()
{
return new Text();
}
/**
 * Create an object of the appropriate type to be used as a value.
 */
public SequencedFragment createValue()
{
return new SequencedFragment();
}
/**
 * Returns the current position in the input (byte offset for uncompressed
 * files; for compressed input it counts bytes consumed from the stream).
 */
public long getPos() { return pos; }
/**
 * Fraction of this reader's slice consumed so far, clamped to [0, 1].
 * An empty slice (start == end) reports full progress.
 */
public float getProgress()
{
    if (start == end)
        return 1.0f;
    final float consumed = (pos - start) / (float)(end - start);
    return Math.min(1.0f, consumed);
}
/**
 * Formats the current location as "file:offset" for error messages.
 */
public String makePositionMessage()
{
    final StringBuilder msg = new StringBuilder(file.toString());
    msg.append(':').append(pos);
    return msg.toString();
}
/**
 * Reads one raw FASTQ record (4 lines) from the stream into key and value.
 * No quality-encoding conversion or QC filtering is done here; see next().
 *
 * @return false if EOF was reached before any data was read; true otherwise.
 * @throws RuntimeException if the '+' separator line is missing.
 * @throws EOFException (from readLineInto) if the record is truncated.
 */
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException
{
// ID line
long skipped = lineReader.skip(1); // skip @
pos += skipped;
if (skipped == 0)
return false; // EOF
// ID
readLineInto(key);
// sequence
value.clear();
readLineInto(value.getSequence());
// separator line: must begin with '+'
readLineInto(buffer);
if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+')
throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key);
readLineInto(value.getQuality());
// look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false
lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value);
if (!lookForIlluminaIdentifier)
scanNameForReadNumber(key, value);
return true;
}
/**
 * Reads the next key/value pair from the input for processing.
 *
 * Skips reads that failed QC when filtering is enabled, then either
 * converts Illumina-scaled qualities to Sanger scale or verifies that
 * the qualities are within the Sanger Phred+33 range.
 *
 * @return true if a record was read into key/value; false at the end of
 *         the slice or file.
 * @throws RuntimeException if a record is truncated at EOF.
 * @throws FormatException if base qualities are invalid for the configured encoding.
 */
public boolean next(Text key, SequencedFragment value) throws IOException
{
if (pos >= end)
return false; // past end of slice
try
{
boolean gotData;
boolean goodRecord;
do {
gotData = lowLevelFastqRead(key, value);
goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed());
} while (gotData && !goodRecord);
if (goodRecord) // goodRecord is false also when we couldn't read any more data
{
if (qualityEncoding == BaseQualityEncoding.Illumina)
{
try
{
// convert illumina to sanger scale
SequencedFragment.convertQuality(value.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger);
} catch (FormatException e) {
throw new FormatException(e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key);
}
}
else // sanger qualities.
{
int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(), BaseQualityEncoding.Sanger);
if (outOfRangeElement >= 0)
{
throw new FormatException("fastq base quality score out of range for Sanger Phred+33 format (found " +
(value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" +
"Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" +
"Position: " + makePositionMessage() + "; Sequence ID: " + key);
}
}
}
return goodRecord;
}
catch (EOFException e) {
throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage() + ". Id: " + key.toString());
}
}
/**
 * Detects a trailing "/<digit>" suffix on the read name (e.g. "frag/1")
 * and, if present, records the digit as the fragment's read number.
 */
private void scanNameForReadNumber(Text name, SequencedFragment fragment)
{
    final int len = name.getLength();
    if (len < 2)
        return; // too short to carry a "/N" suffix

    final byte[] raw = name.getBytes();
    final byte lastByte = raw[len - 1];
    final boolean hasReadSuffix =
        raw[len - 2] == '/' && lastByte >= '0' && lastByte <= '9';
    if (hasReadSuffix)
        fragment.setRead(lastByte - '0');
}
/**
 * Tries to parse the read name as a Casava 1.8-style Illumina identifier,
 * copying each captured field into the fragment on success.
 *
 * @return true if the name matched ILLUMINA_PATTERN; false otherwise
 *         (in which case the fragment is left untouched).
 */
private boolean scanIlluminaId(Text name, SequencedFragment fragment)
{
    final Matcher matcher = ILLUMINA_PATTERN.matcher(name.toString());
    if (!matcher.matches())
        return false;

    fragment.setInstrument(matcher.group(1));
    fragment.setRunNumber(Integer.parseInt(matcher.group(2)));
    fragment.setFlowcellId(matcher.group(3));
    fragment.setLane(Integer.parseInt(matcher.group(4)));
    fragment.setTile(Integer.parseInt(matcher.group(5)));
    fragment.setXpos(Integer.parseInt(matcher.group(6)));
    fragment.setYpos(Integer.parseInt(matcher.group(7)));
    fragment.setRead(Integer.parseInt(matcher.group(8)));
    // filter flag: 'Y' means the read was filtered out, so 'N' means it passed
    fragment.setFilterPassed("N".equals(matcher.group(9)));
    fragment.setControlNumber(Integer.parseInt(matcher.group(10)));
    fragment.setIndexSequence(matcher.group(11));
    return true;
}
/**
 * Reads one line from lineReader into dest and advances pos.
 *
 * @return the number of bytes consumed (always positive).
 * @throws EOFException if no bytes could be read.
 */
private int readLineInto(Text dest) throws EOFException, IOException
{
    final int consumed = lineReader.readLine(dest, MAX_LINE_LENGTH);
    if (consumed <= 0)
        throw new EOFException();
    pos += consumed;
    return consumed;
}
}
/**
 * A FASTQ file can be split only when it is not compressed; compressed
 * input must be consumed as a single split from offset 0.
 */
@Override
public boolean isSplitable(JobContext context, Path path)
{
    final CompressionCodecFactory factory =
        new CompressionCodecFactory(context.getConfiguration());
    return factory.getCodec(path) == null;
}
/**
 * Creates a FastqRecordReader for the given split.
 *
 * @param genericSplit must be a FileSplit.
 * @throws IOException if the split's file cannot be opened.
 */
public RecordReader<Text, SequencedFragment> createRecordReader(
        InputSplit genericSplit,
        TaskAttemptContext context) throws IOException, InterruptedException
{
    context.setStatus(genericSplit.toString());
    return new FastqRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat
}
}