org.seqdoop.hadoop_bam.SequencedFragment Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
package org.seqdoop.hadoop_bam;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
/**
 * A sequenced fragment: a base sequence, its base qualities and a set of
 * optional metadata fields, serializable through Hadoop's {@link Writable}.
 *
 * <p>The sequence and quality are never {@code null}; every other field may be
 * {@code null}, meaning "not set". On the wire, a bit mask (see the
 * {@code *_Present} constants) records which optional fields follow.
 */
public class SequencedFragment implements Writable
{
    protected Text sequence = new Text();
    protected Text quality = new Text();

    protected String instrument;
    protected Integer runNumber;
    protected String flowcellId;
    protected Integer lane;
    protected Integer tile;
    protected Integer xpos;
    protected Integer ypos;
    protected Integer read;
    protected Boolean filterPassed;
    protected Integer controlNumber;
    protected String indexSequence;

    // Bit flags used when serializing the nullable fields: a set bit means the
    // corresponding field is non-null and its value is present in the stream.
    protected static final int Instrument_Present = 0x0001;
    protected static final int RunNumber_Present = 0x0002;
    protected static final int FlowcellId_Present = 0x0004;
    protected static final int Lane_Present = 0x0008;
    protected static final int Tile_Present = 0x0010;
    protected static final int Xpos_Present = 0x0020;
    protected static final int Ypos_Present = 0x0040;
    protected static final int Read_Present = 0x0080;
    protected static final int FilterPassed_Present = 0x0100;
    protected static final int ControlNumber_Present = 0x0200;
    protected static final int IndexSequence_Present = 0x0400;

    /**
     * Resets this fragment: empties the sequence and quality Text objects and
     * sets every optional field to {@code null}.
     */
    public void clear()
    {
        sequence.clear();
        quality.clear();

        instrument = null;
        runNumber = null;
        flowcellId = null;
        lane = null;
        tile = null;
        xpos = null;
        ypos = null;
        read = null;
        filterPassed = null;
        controlNumber = null;
        indexSequence = null;
    }

    /**
     * Get sequence Text object.
     * Trade encapsulation for efficiency. Here we expose the internal Text
     * object so that data may be read and written directly from/to it.
     *
     * Sequence should always be written using CAPITAL letters and 'N' for unknown bases.
     */
    public Text getSequence() { return sequence; }

    /**
     * Get quality Text object.
     * Trade encapsulation for efficiency. Here we expose the internal Text
     * object so that data may be read and written directly from/to it.
     *
     * Quality should always be in ASCII-encoded Phred+33 format (sanger).
     */
    public Text getQuality() { return quality; }

    public void setInstrument(String v) { instrument = v; }
    public void setRunNumber(Integer v) { runNumber = v; }
    public void setFlowcellId(String v) { flowcellId = v; }
    public void setLane(Integer v) { lane = v; }
    public void setTile(Integer v) { tile = v; }
    public void setXpos(Integer v) { xpos = v; }
    public void setYpos(Integer v) { ypos = v; }
    public void setRead(Integer v) { read = v; }
    public void setFilterPassed(Boolean v) { filterPassed = v; }
    public void setControlNumber(Integer v) { controlNumber = v; }
    public void setIndexSequence(String v) { indexSequence = v; }

    /**
     * Replace the sequence Text object (the reference is stored, not copied).
     *
     * @throws IllegalArgumentException if {@code seq} is null.
     */
    public void setSequence(Text seq)
    {
        if (seq == null)
            throw new IllegalArgumentException("can't have a null sequence");
        sequence = seq;
    }

    /**
     * Set quality. Quality should be encoded in Sanger Phred+33 format.
     * The reference is stored, not copied.
     *
     * @throws IllegalArgumentException if {@code qual} is null.
     */
    public void setQuality(Text qual)
    {
        if (qual == null)
            throw new IllegalArgumentException("can't have a null quality");
        quality = qual;
    }

    public String getInstrument() { return instrument; }
    public Integer getRunNumber() { return runNumber; }
    public String getFlowcellId() { return flowcellId; }
    public Integer getLane() { return lane; }
    public Integer getTile() { return tile; }
    public Integer getXpos() { return xpos; }
    public Integer getYpos() { return ypos; }
    public Integer getRead() { return read; }
    public Boolean getFilterPassed() { return filterPassed; }
    public Integer getControlNumber() { return controlNumber; }
    public String getIndexSequence() { return indexSequence; }

    /**
     * Recreates a pseudo qseq record with the fields available.
     *
     * Fields are tab-separated; unset (null) fields are rendered as "null".
     * The last column is 0 only when filterPassed is explicitly false, and 1
     * otherwise (including when it is unset). controlNumber is not included.
     */
    @Override
    public String toString()
    {
        String delim = "\t";
        StringBuilder builder = new StringBuilder(800);
        builder.append(instrument).append(delim);
        builder.append(runNumber).append(delim);
        builder.append(flowcellId).append(delim);
        builder.append(lane).append(delim);
        builder.append(tile).append(delim);
        builder.append(xpos).append(delim);
        builder.append(ypos).append(delim);
        builder.append(indexSequence).append(delim);
        builder.append(read).append(delim);
        builder.append(sequence).append(delim);
        builder.append(quality).append(delim);
        builder.append((filterPassed == null || filterPassed) ? 1 : 0);
        return builder.toString();
    }

    /**
     * Two fragments are equal when all their optional fields are equal
     * (null matches only null) and their sequence and quality are equal.
     */
    @Override
    public boolean equals(Object other)
    {
        // instanceof is already false for null, no separate null check needed
        if (!(other instanceof SequencedFragment))
            return false;

        SequencedFragment otherFrag = (SequencedFragment)other;
        return nullSafeEquals(instrument, otherFrag.instrument)
            && nullSafeEquals(runNumber, otherFrag.runNumber)
            && nullSafeEquals(flowcellId, otherFrag.flowcellId)
            && nullSafeEquals(lane, otherFrag.lane)
            && nullSafeEquals(tile, otherFrag.tile)
            && nullSafeEquals(xpos, otherFrag.xpos)
            && nullSafeEquals(ypos, otherFrag.ypos)
            && nullSafeEquals(read, otherFrag.read)
            && nullSafeEquals(filterPassed, otherFrag.filterPassed)
            && nullSafeEquals(controlNumber, otherFrag.controlNumber)
            && nullSafeEquals(indexSequence, otherFrag.indexSequence)
            // sequence and quality can't be null
            && sequence.equals(otherFrag.sequence)
            && quality.equals(otherFrag.quality);
    }

    @Override
    public int hashCode() {
        int result = sequence.hashCode();
        result = 31 * result + quality.hashCode();
        result = 31 * result + nullSafeHash(instrument);
        result = 31 * result + nullSafeHash(runNumber);
        result = 31 * result + nullSafeHash(flowcellId);
        result = 31 * result + nullSafeHash(lane);
        result = 31 * result + nullSafeHash(tile);
        result = 31 * result + nullSafeHash(xpos);
        result = 31 * result + nullSafeHash(ypos);
        result = 31 * result + nullSafeHash(read);
        result = 31 * result + nullSafeHash(filterPassed);
        result = 31 * result + nullSafeHash(controlNumber);
        result = 31 * result + nullSafeHash(indexSequence);
        return result;
    }

    /** True when both references are null, or when a is non-null and a.equals(b). */
    private static boolean nullSafeEquals(Object a, Object b)
    {
        return (a == null) ? (b == null) : a.equals(b);
    }

    /** Hash code of o, or 0 when o is null. */
    private static int nullSafeHash(Object o)
    {
        return (o != null) ? o.hashCode() : 0;
    }

    /**
     * Convert quality scores in-place.
     *
     * The whole buffer is validated before any byte is modified, so when a
     * FormatException is thrown the quality is left exactly as it was passed in.
     *
     * @throws FormatException if quality scores are out of the range
     * allowed by the current encoding.
     * @throws IllegalArgumentException if current and target quality encodings are the same,
     * or if the requested transformation is not supported.
     */
    public static void convertQuality(Text quality, BaseQualityEncoding current, BaseQualityEncoding target)
    {
        if (current == target)
            throw new IllegalArgumentException("current and target quality encodings are the same (" + current + ")");

        final int illuminaSangerDistance = FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET;

        if (current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger)
        {
            // validate first so that a failure cannot leave the buffer half-converted
            final int bad = verifyQuality(quality, BaseQualityEncoding.Illumina);
            final byte[] bytes = quality.getBytes();
            if (bad >= 0)
            {
                throw new FormatException(
                    "base quality score out of range for Illumina Phred+64 format (found " + (bytes[bad] - FormatConstants.ILLUMINA_OFFSET) +
                    " but acceptable range is [0," + FormatConstants.ILLUMINA_MAX + "]).\n" +
                    "Maybe qualities are encoded in Sanger format?\n");
            }

            final int len = quality.getLength();
            for (int i = 0; i < len; ++i)
                bytes[i] -= illuminaSangerDistance;
        }
        else if (current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina)
        {
            // validate first so that a failure cannot leave the buffer half-converted
            final int bad = verifyQuality(quality, BaseQualityEncoding.Sanger);
            final byte[] bytes = quality.getBytes();
            if (bad >= 0)
            {
                throw new FormatException(
                    "base quality score out of range for Sanger Phred+33 format (found " + (bytes[bad] - FormatConstants.SANGER_OFFSET) +
                    " but acceptable range is [0," + FormatConstants.SANGER_MAX + "]).\n" +
                    "Maybe qualities are encoded in Illumina format?\n");
            }

            final int len = quality.getLength();
            for (int i = 0; i < len; ++i)
                bytes[i] += illuminaSangerDistance;
        }
        else
            throw new IllegalArgumentException("unsupported BaseQualityEncoding transformation from " + current + " to " + target);
    }

    /**
     * Verify that the given quality bytes are within the range allowed for the specified encoding.
     *
     * In theory, the Sanger encoding uses the entire
     * range of characters from ASCII 33 to 126, giving a value range of [0,93]. However, values over 60 are
     * unlikely in practice, and are more likely to be caused by mistaking a file that uses Illumina encoding
     * for Sanger. So, we'll enforce the same range supported by Illumina encoding ([0,62]) for Sanger.
     *
     * @return -1 if quality is ok; otherwise the index of the first out-of-range value.
     * @throws IllegalArgumentException if the encoding is neither Illumina nor Sanger.
     */
    public static int verifyQuality(Text quality, BaseQualityEncoding encoding)
    {
        // set allowed quality range
        int max, min;

        if (encoding == BaseQualityEncoding.Illumina)
        {
            max = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX;
            min = FormatConstants.ILLUMINA_OFFSET;
        }
        else if (encoding == BaseQualityEncoding.Sanger)
        {
            max = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX;
            min = FormatConstants.SANGER_OFFSET;
        }
        else
            throw new IllegalArgumentException("Unsupported base encoding quality " + encoding);

        // verify: only the first getLength() bytes of the array are examined
        final byte[] bytes = quality.getBytes();
        final int len = quality.getLength();

        for (int i = 0; i < len; ++i)
        {
            if (bytes[i] < min || bytes[i] > max)
                return i;
        }
        return -1;
    }

    /**
     * Reads this fragment from {@code in}, replacing its current contents.
     *
     * Serialization order:
     * 1) sequence
     * 2) quality
     * 3) int with flags indicating which fields are defined (see *_Present flags)
     * 4..end) the rest of the fields, in flag order, only those whose flag is set
     */
    public void readFields(DataInput in) throws IOException
    {
        // TODO: reimplement with a serialization system (e.g. Avro)

        // start from a clean state so that absent fields end up null
        this.clear();

        sequence.readFields(in);
        quality.readFields(in);

        int presentFlags = WritableUtils.readVInt(in);
        if ( (presentFlags & Instrument_Present) != 0) instrument = WritableUtils.readString(in);
        if ( (presentFlags & RunNumber_Present) != 0) runNumber = WritableUtils.readVInt(in);
        if ( (presentFlags & FlowcellId_Present) != 0) flowcellId = WritableUtils.readString(in);
        if ( (presentFlags & Lane_Present) != 0) lane = WritableUtils.readVInt(in);
        if ( (presentFlags & Tile_Present) != 0) tile = WritableUtils.readVInt(in);
        if ( (presentFlags & Xpos_Present) != 0) xpos = WritableUtils.readVInt(in);
        if ( (presentFlags & Ypos_Present) != 0) ypos = WritableUtils.readVInt(in);
        if ( (presentFlags & Read_Present) != 0) read = WritableUtils.readVInt(in);
        // booleans travel as a VInt: 1 is true, anything else is false
        if ( (presentFlags & FilterPassed_Present) != 0) filterPassed = WritableUtils.readVInt(in) == 1;
        if ( (presentFlags & ControlNumber_Present) != 0) controlNumber = WritableUtils.readVInt(in);
        if ( (presentFlags & IndexSequence_Present) != 0) indexSequence = WritableUtils.readString(in);
    }

    /**
     * Writes this fragment to {@code out}: sequence, quality, a VInt bit mask of
     * the non-null optional fields, then those fields in flag order.
     */
    public void write(DataOutput out) throws IOException
    {
        // TODO: reimplement with a serialization system (e.g. Avro)

        sequence.write(out);
        quality.write(out);

        int presentFlags = 0;
        if (instrument != null) presentFlags |= Instrument_Present;
        if (runNumber != null) presentFlags |= RunNumber_Present;
        if (flowcellId != null) presentFlags |= FlowcellId_Present;
        if (lane != null) presentFlags |= Lane_Present;
        if (tile != null) presentFlags |= Tile_Present;
        if (xpos != null) presentFlags |= Xpos_Present;
        if (ypos != null) presentFlags |= Ypos_Present;
        if (read != null) presentFlags |= Read_Present;
        if (filterPassed != null) presentFlags |= FilterPassed_Present;
        if (controlNumber != null) presentFlags |= ControlNumber_Present;
        if (indexSequence != null) presentFlags |= IndexSequence_Present;

        WritableUtils.writeVInt(out, presentFlags);

        if (instrument != null) WritableUtils.writeString(out, instrument);
        if (runNumber != null) WritableUtils.writeVInt(out, runNumber);
        if (flowcellId != null) WritableUtils.writeString(out, flowcellId);
        if (lane != null) WritableUtils.writeVInt(out, lane);
        if (tile != null) WritableUtils.writeVInt(out, tile);
        if (xpos != null) WritableUtils.writeVInt(out, xpos);
        if (ypos != null) WritableUtils.writeVInt(out, ypos);
        if (read != null) WritableUtils.writeVInt(out, read);
        if (filterPassed != null) WritableUtils.writeVInt(out, filterPassed ? 1 : 0);
        if (controlNumber != null) WritableUtils.writeVInt(out, controlNumber);
        if (indexSequence != null) WritableUtils.writeString(out, indexSequence);
    }
}