All downloads are free. Search and download functionality uses the official Maven repository.

org.seqdoop.hadoop_bam.SequencedFragment Maven / Gradle / Ivy

Go to download

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.

There is a newer version: 7.10.0
Show newest version
// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

package org.seqdoop.hadoop_bam;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;

import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;

/**
 * A sequenced fragment: a base sequence, its base qualities, and a set of
 * optional metadata fields.
 *
 * The sequence and quality are held in Hadoop {@link Text} objects and are
 * never null.  Every other field is nullable; a null value means "not set".
 * The object is serialized through the Hadoop {@link Writable} interface
 * (see {@link #write(DataOutput)} for the wire layout).
 */
public class SequencedFragment implements Writable
{
	protected Text sequence = new Text();
	protected Text quality = new Text();

	protected String instrument;
	protected Integer runNumber;
	protected String flowcellId;
	protected Integer lane;
	protected Integer tile;
	protected Integer xpos;
	protected Integer ypos;
	protected Integer read;
	protected Boolean filterPassed;
	protected Integer controlNumber;
	protected String indexSequence;

	// Bit flags used when serializing the nullable fields: a set bit means
	// the corresponding field is non-null and follows in the stream.
	protected static final int Instrument_Present     = 0x0001;
	protected static final int RunNumber_Present      = 0x0002;
	protected static final int FlowcellId_Present     = 0x0004;
	protected static final int Lane_Present           = 0x0008;
	protected static final int Tile_Present           = 0x0010;
	protected static final int Xpos_Present           = 0x0020;
	protected static final int Ypos_Present           = 0x0040;
	protected static final int Read_Present           = 0x0080;
	protected static final int FilterPassed_Present   = 0x0100;
	protected static final int ControlNumber_Present  = 0x0200;
	protected static final int IndexSequence_Present  = 0x0400;

	/**
	 * Reset this fragment: empties the sequence and quality Text objects
	 * (the objects themselves are kept) and sets every optional field to null.
	 */
	public void clear()
	{
		sequence.clear();
		quality.clear();

		instrument = null;
		runNumber = null;
		flowcellId = null;
		lane = null;
		tile = null;
		xpos = null;
		ypos = null;
		read = null;
		filterPassed = null;
		controlNumber = null;
		indexSequence = null;
	}

	/**
	 * Get sequence Text object.
	 * Trade encapsulation for efficiency.  Here we expose the internal Text
	 * object so that data may be read and written directly from/to it.
	 *
	 * Sequence should always be written using CAPITAL letters and 'N' for unknown bases.
	 */
	public Text getSequence() { return sequence; }

	/**
	 * Get quality Text object.
	 * Trade encapsulation for efficiency.  Here we expose the internal Text
	 * object so that data may be read and written directly from/to it.
	 *
	 * Quality should always be in ASCII-encoded Phred+33 format (sanger).
	 */
	public Text getQuality() { return quality; }

	public void setInstrument(String v) { instrument = v; }
	public void setRunNumber(Integer v) { runNumber = v; }
	public void setFlowcellId(String v) { flowcellId = v; }
	public void setLane(Integer v) { lane = v; }
	public void setTile(Integer v) { tile = v; }
	public void setXpos(Integer v) { xpos = v; }
	public void setYpos(Integer v) { ypos = v; }
	public void setRead(Integer v) { read = v; }
	public void setFilterPassed(Boolean v) { filterPassed = v; }
	public void setControlNumber(Integer v) { controlNumber = v; }
	public void setIndexSequence(String v) { indexSequence = v; }

	/**
	 * Set the sequence.  The given Text object is stored by reference (it is
	 * not copied), so later changes to it are visible through this fragment.
	 *
	 * @throws IllegalArgumentException if seq is null.
	 */
	public void setSequence(Text seq)
	{
		if (seq == null)
			throw new IllegalArgumentException("can't have a null sequence");
		sequence = seq;
	}

	/**
	 * Set quality.  Quality should be encoded in Sanger Phred+33 format.
	 * The given Text object is stored by reference (it is not copied).
	 *
	 * @throws IllegalArgumentException if qual is null.
	 */
	public void setQuality(Text qual)
	{
		if (qual == null)
			throw new IllegalArgumentException("can't have a null quality");
		quality = qual;
	}

	public String getInstrument() { return instrument; }
	public Integer getRunNumber() { return runNumber; }
	public String getFlowcellId() { return flowcellId; }
	public Integer getLane() { return lane; }
	public Integer getTile() { return tile; }
	public Integer getXpos() { return xpos; }
	public Integer getYpos() { return ypos; }
	public Integer getRead() { return read; }
	public Boolean getFilterPassed() { return filterPassed; }
	public Integer getControlNumber() { return controlNumber; }
	public String getIndexSequence() { return indexSequence; }

	/**
	 * Recreates a pseudo qseq record with the fields available.
	 *
	 * Fields are tab-separated.  Unset (null) fields are rendered as the
	 * string "null"; an unset filterPassed is rendered as 1.
	 */
	@Override
	public String toString()
	{
		String delim = "\t";
		StringBuilder builder = new StringBuilder(800);
		builder.append(instrument).append(delim);
		builder.append(runNumber).append(delim);
		builder.append(flowcellId).append(delim);
		builder.append(lane).append(delim);
		builder.append(tile).append(delim);
		builder.append(xpos).append(delim);
		builder.append(ypos).append(delim);
		builder.append(indexSequence).append(delim);
		builder.append(read).append(delim);
		builder.append(sequence).append(delim);
		builder.append(quality).append(delim);
		builder.append((filterPassed == null || filterPassed) ? 1 : 0);
		return builder.toString();
	}

	/**
	 * Two fragments are equal when all their fields (sequence, quality and
	 * every optional field, with null matching only null) are equal.
	 */
	@Override
	public boolean equals(Object other)
	{
		if (this == other)
			return true;
		if (!(other instanceof SequencedFragment)) // also rejects null
			return false;

		final SequencedFragment otherFrag = (SequencedFragment)other;

		// sequence and quality can't be null
		return sameValue(instrument, otherFrag.instrument)
			&& sameValue(runNumber, otherFrag.runNumber)
			&& sameValue(flowcellId, otherFrag.flowcellId)
			&& sameValue(lane, otherFrag.lane)
			&& sameValue(tile, otherFrag.tile)
			&& sameValue(xpos, otherFrag.xpos)
			&& sameValue(ypos, otherFrag.ypos)
			&& sameValue(read, otherFrag.read)
			&& sameValue(filterPassed, otherFrag.filterPassed)
			&& sameValue(controlNumber, otherFrag.controlNumber)
			&& sameValue(indexSequence, otherFrag.indexSequence)
			&& sequence.equals(otherFrag.sequence)
			&& quality.equals(otherFrag.quality);
	}

	/** Null-safe equality: two nulls are equal, null never equals non-null. */
	private static boolean sameValue(Object a, Object b)
	{
		return a == null ? b == null : a.equals(b);
	}

	@Override
	public int hashCode() {
		int result = sequence.hashCode();
		result = 31 * result + quality.hashCode();
		result = 31 * result + (instrument != null ? instrument.hashCode() : 0);
		result = 31 * result + (runNumber != null ? runNumber.hashCode() : 0);
		result = 31 * result + (flowcellId != null ? flowcellId.hashCode() : 0);
		result = 31 * result + (lane != null ? lane.hashCode() : 0);
		result = 31 * result + (tile != null ? tile.hashCode() : 0);
		result = 31 * result + (xpos != null ? xpos.hashCode() : 0);
		result = 31 * result + (ypos != null ? ypos.hashCode() : 0);
		result = 31 * result + (read != null ? read.hashCode() : 0);
		result = 31 * result + (filterPassed != null ? filterPassed.hashCode() : 0);
		result = 31 * result + (controlNumber != null ? controlNumber.hashCode() : 0);
		result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0);
		return result;
	}

	/**
	 * Convert quality scores in-place.
	 *
	 * All scores are validated against the current encoding before any byte is
	 * modified, so when a FormatException is thrown the Text is left untouched.
	 * Only Illumina to Sanger and Sanger to Illumina conversions are supported.
	 *
	 * @param quality the quality string to convert; its backing bytes are modified.
	 * @param current the encoding the quality string is currently in.
	 * @param target the encoding to convert to.
	 * @throws FormatException if quality scores are out of the range
	 * allowed by the current encoding.
	 * @throws IllegalArgumentException if current and target quality encodings are the same,
	 * or if the requested transformation is not supported.
	 */
	public static void convertQuality(Text quality, BaseQualityEncoding current, BaseQualityEncoding target)
	{
		if (current == target)
			throw new IllegalArgumentException("current and target quality encodings are the same (" + current + ")");

		final boolean illuminaToSanger = current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger;
		final boolean sangerToIllumina = current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina;

		if (!illuminaToSanger && !sangerToIllumina)
			throw new IllegalArgumentException("unsupported BaseQualityEncoding transformation from " + current + " to " + target);

		final byte[] bytes = quality.getBytes();
		// Text's backing array may be longer than its content; only the first len bytes are valid
		final int len = quality.getLength();

		// Validate everything first: a failure must not leave a half-converted buffer behind.
		final int badIndex = verifyQuality(quality, current);
		if (badIndex >= 0)
		{
			if (illuminaToSanger)
			{
				throw new FormatException(
						"base quality score out of range for Illumina Phred+64 format (found " + (bytes[badIndex] - FormatConstants.ILLUMINA_OFFSET) +
						" but acceptable range is [0," + FormatConstants.ILLUMINA_MAX + "]).\n" +
						"Maybe qualities are encoded in Sanger format?\n");
			}
			else
			{
				throw new FormatException(
						"base quality score out of range for Sanger Phred+33 format (found " + (bytes[badIndex] - FormatConstants.SANGER_OFFSET) +
						" but acceptable range is [0," + FormatConstants.SANGER_MAX + "]).\n" +
						"Maybe qualities are encoded in Illumina format?\n");
			}
		}

		final int illuminaSangerDistance = FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET;
		// Illumina -> Sanger lowers each byte by the offset distance; Sanger -> Illumina raises it
		final int shift = illuminaToSanger ? -illuminaSangerDistance : illuminaSangerDistance;

		for (int i = 0; i < len; ++i)
			bytes[i] += shift;
	}

	/**
	 * Verify that the given quality bytes are within the range allowed for the specified encoding.
	 *
	 * In theory, the Sanger encoding uses the entire
	 * range of characters from ASCII 33 to 126, giving a value range of [0,93].  However, values over 60 are
	 * unlikely in practice, and are more likely to be caused by mistaking a file that uses Illumina encoding
	 * for Sanger.  So, the intent is to enforce the same range supported by Illumina encoding ([0,62]) for
	 * Sanger (the actual limit is whatever FormatConstants.SANGER_MAX is set to).
	 *
	 * @param quality the quality string to check; it is not modified.
	 * @param encoding the encoding the quality string is expected to be in.
	 * @return -1 if quality is ok; otherwise the index of the first out-of-range value.
	 * @throws IllegalArgumentException if encoding is neither Illumina nor Sanger.
	 */
	public static int verifyQuality(Text quality, BaseQualityEncoding encoding)
	{
		// set allowed quality range
		int max, min;

		if (encoding == BaseQualityEncoding.Illumina)
		{
			max = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX;
			min = FormatConstants.ILLUMINA_OFFSET;
		}
		else if (encoding == BaseQualityEncoding.Sanger)
		{
			max = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX;
			min = FormatConstants.SANGER_OFFSET;
		}
		else
			throw new IllegalArgumentException("Unsupported base encoding quality " + encoding);

		// verify
		final byte[] bytes = quality.getBytes();
		final int len = quality.getLength();

		for (int i = 0; i < len; ++i)
		{
			if (bytes[i] < min || bytes[i] > max)
				return i;
		}
		return -1;
	}

	/**
	 * Deserialize this fragment from the given input, replacing its current
	 * contents.  Fields whose presence flag is not set are left null.
	 *
	 * @throws IOException if reading from the input fails.
	 */
	public void readFields(DataInput in) throws IOException
	{
		// TODO:  reimplement with a serialization system (e.g. Avro)

		// serialization order:
		// 1) sequence
		// 2) quality
		// 3) int with flags indicating which fields are defined (see *_Present flags)
		// 4..end) the rest of the fields

		this.clear();

		sequence.readFields(in);
		quality.readFields(in);

		int presentFlags = WritableUtils.readVInt(in);
		if ( (presentFlags & Instrument_Present) != 0) instrument = WritableUtils.readString(in);
		if ( (presentFlags & RunNumber_Present) != 0) runNumber = WritableUtils.readVInt(in);
		if ( (presentFlags & FlowcellId_Present) != 0) flowcellId = WritableUtils.readString(in);
		if ( (presentFlags & Lane_Present) != 0) lane = WritableUtils.readVInt(in);
		if ( (presentFlags & Tile_Present) != 0) tile = WritableUtils.readVInt(in);
		if ( (presentFlags & Xpos_Present) != 0) xpos = WritableUtils.readVInt(in);
		if ( (presentFlags & Ypos_Present) != 0) ypos = WritableUtils.readVInt(in);
		if ( (presentFlags & Read_Present) != 0) read = WritableUtils.readVInt(in);
		// filterPassed is stored as a VInt: 1 means true, anything else false
		if ( (presentFlags & FilterPassed_Present) != 0) filterPassed = WritableUtils.readVInt(in) == 1;
		if ( (presentFlags & ControlNumber_Present) != 0) controlNumber = WritableUtils.readVInt(in);
		if ( (presentFlags & IndexSequence_Present) != 0) indexSequence = WritableUtils.readString(in);
	}

	/**
	 * Serialize this fragment to the given output.
	 *
	 * Layout: sequence, quality, a VInt of *_Present flags, then each non-null
	 * optional field in flag order.  Must stay in sync with
	 * {@link #readFields(DataInput)}.
	 *
	 * @throws IOException if writing to the output fails.
	 */
	public void write(DataOutput out) throws IOException
	{
		// TODO:  reimplement with a serialization system (e.g. Avro)

		sequence.write(out);
		quality.write(out);

		int presentFlags = 0;
		if (instrument != null) presentFlags |= Instrument_Present;
		if (runNumber != null) presentFlags |= RunNumber_Present;
		if (flowcellId != null) presentFlags |= FlowcellId_Present;
		if (lane != null) presentFlags |= Lane_Present;
		if (tile != null) presentFlags |= Tile_Present;
		if (xpos != null) presentFlags |= Xpos_Present;
		if (ypos != null) presentFlags |= Ypos_Present;
		if (read != null) presentFlags |= Read_Present;
		if (filterPassed != null) presentFlags |= FilterPassed_Present;
		if (controlNumber != null) presentFlags |= ControlNumber_Present;
		if (indexSequence != null) presentFlags |= IndexSequence_Present;

		WritableUtils.writeVInt(out, presentFlags);

		if (instrument != null) WritableUtils.writeString(out, instrument);
		if (runNumber != null) WritableUtils.writeVInt(out, runNumber);
		if (flowcellId != null) WritableUtils.writeString(out, flowcellId);
		if (lane != null) WritableUtils.writeVInt(out, lane);
		if (tile != null) WritableUtils.writeVInt(out, tile);
		if (xpos != null) WritableUtils.writeVInt(out, xpos);
		if (ypos != null) WritableUtils.writeVInt(out, ypos);
		if (read != null) WritableUtils.writeVInt(out, read);
		if (filterPassed != null) WritableUtils.writeVInt(out, filterPassed ? 1 : 0);
		if (controlNumber != null) WritableUtils.writeVInt(out, controlNumber);
		if (indexSequence != null) WritableUtils.writeString(out, indexSequence);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy