org.seqdoop.hadoop_bam.FastqOutputFormat Maven / Gradle / Ivy

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Hadoop-BAM.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

package org.seqdoop.hadoop_bam;

import java.io.DataOutputStream;
import java.io.OutputStream;
import java.io.IOException;

import java.nio.charset.Charset;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;

/**
 * Output format for the fastq format.
 */
// If a key is provided with the SequencedFragment, the key is used as the sequence
// id and the meta-info from the SequencedFragment (if any) is dropped.
// If the key is null, then the format will attempt to create an
// Illumina-style fastq id as specified in the Casava users' guide v1.8:
// @instrument:run number:flowcell ID:lane:tile:x-pos:y-pos \s+ read:is filtered:control number:index sequence
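// (An illustrative id of this form, not taken from this codebase:
// @EAS139:136:FC706VJ:2:2104:15343:197393 1:N:18:ATCACG)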
//
public class FastqOutputFormat extends TextOutputFormat<Text,SequencedFragment>
{
	public static final String CONF_BASE_QUALITY_ENCODING         = "hbam.fastq-output.base-quality-encoding";
	public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger";
	public static final Charset UTF8 = Charset.forName("UTF8");

	static final byte[] PLUS_LINE;
	static {
		try {
			PLUS_LINE = "\n+\n".getBytes("us-ascii");
		} catch (java.io.UnsupportedEncodingException e) {
			throw new RuntimeException("us-ascii encoding not supported!");
		}
	}

	public static class FastqRecordWriter extends RecordWriter<Text,SequencedFragment>
	{
		protected StringBuilder       sBuilder          = new StringBuilder(800);
		protected Text                buffer            = new Text();
		protected OutputStream        out;
		protected BaseQualityEncoding baseQualityFormat;

		public FastqRecordWriter(Configuration conf, OutputStream out)
		{
			this.out = out;
			setConf(conf);
		}

		public void setConf(Configuration conf)
		{
			String setting = conf.get(CONF_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT);
			if ("illumina".equals(setting))
				baseQualityFormat = BaseQualityEncoding.Illumina;
			else if ("sanger".equals(setting))
				baseQualityFormat = BaseQualityEncoding.Sanger;
			else
				throw new RuntimeException("Invalid property value '" + setting + "' for " + CONF_BASE_QUALITY_ENCODING + ".  Valid values are 'illumina' or 'sanger'");
		}

		protected String makeId(SequencedFragment seq) throws IOException
		{
			String delim = ":";
			sBuilder.delete(0, sBuilder.length()); // clear

			sBuilder.append( seq.getInstrument() == null ? "" : seq.getInstrument() ).append(delim);
			sBuilder.append( seq.getRunNumber()  == null ? "" : seq.getRunNumber().toString() ).append(delim);
			sBuilder.append( seq.getFlowcellId() == null ? "" : seq.getFlowcellId() ).append(delim);
			sBuilder.append( seq.getLane()       == null ? "" : seq.getLane().toString() ).append(delim);
			sBuilder.append( seq.getTile()       == null ? "" : seq.getTile().toString() ).append(delim);
			sBuilder.append( seq.getXpos()       == null ? "" : seq.getXpos().toString() ).append(delim);
			sBuilder.append( seq.getYpos()       == null ? "" : seq.getYpos().toString() );

			sBuilder.append(" "); // space

			sBuilder.append( seq.getRead()       == null ? "" : seq.getRead().toString() ).append(delim);
			sBuilder.append(seq.getFilterPassed() == null || seq.getFilterPassed() ? "N" : "Y");
			sBuilder.append(delim);

			sBuilder.append( seq.getControlNumber() == null ? "0" : seq.getControlNumber().toString()).append(delim);
			sBuilder.append( seq.getIndexSequence() == null ? "" : seq.getIndexSequence());

			return sBuilder.toString();
		}

		public void write(Text key, SequencedFragment seq) throws IOException
		{
			// write the id line
			out.write('@');
			if (key != null)
				out.write(key.getBytes(), 0, key.getLength());
			else
				out.write(makeId(seq).getBytes(UTF8));
			out.write('\n');

			// write the sequence and separator
			out.write(seq.getSequence().getBytes(), 0, seq.getSequence().getLength());
			out.write(PLUS_LINE);

			// now the quality
			if (baseQualityFormat == BaseQualityEncoding.Sanger)
				out.write(seq.getQuality().getBytes(), 0, seq.getQuality().getLength());
			else if (baseQualityFormat == BaseQualityEncoding.Illumina)
			{
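				// SequencedFragment holds qualities in Sanger (Phred+33) form, so re-encode them
				// to Illumina (Phred+64) before writing.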
				buffer.set(seq.getQuality());
				SequencedFragment.convertQuality(buffer, BaseQualityEncoding.Sanger, baseQualityFormat);
				out.write(buffer.getBytes(), 0, buffer.getLength());
			}
			else
				throw new RuntimeException("FastqOutputFormat: unknown base quality format " + baseQualityFormat);

			// and the final newline
			out.write('\n');
		}

		public void close(TaskAttemptContext task) throws IOException
		{
			out.close();
		}
	}

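	// Creates the writer for this task attempt.  Output compression follows the standard
	// FileOutputFormat settings (e.g. mapreduce.output.fileoutputformat.compress), with
	// gzip used as the default codec when compression is enabled but no codec is given.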
	public RecordWriter<Text,SequencedFragment> getRecordWriter(TaskAttemptContext task)
	  throws IOException
	{
		Configuration conf = task.getConfiguration();
		boolean isCompressed = getCompressOutput(task);

		CompressionCodec codec = null;
		String extension = "";

		if (isCompressed)
		{
			Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(task, GzipCodec.class);
			codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
			extension = codec.getDefaultExtension();
		}

		Path file = getDefaultWorkFile(task, extension);
		FileSystem fs = file.getFileSystem(conf);

		OutputStream output;

		if (isCompressed)
		{
			FSDataOutputStream fileOut = fs.create(file, false);
			output = new DataOutputStream(codec.createOutputStream(fileOut));
		}
		else
			output = fs.create(file, false);

		return new FastqRecordWriter(conf, output);
	}
}
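
As a rough usage sketch (not part of the Hadoop-BAM source above): a MapReduce driver could plug this output format into a job roughly as follows. The class name FastqOutputDriver, the output path argument, and the choice of Illumina encoding plus gzip compression are illustrative assumptions, not something this file prescribes.

// Illustrative driver sketch (assumed class and paths; not taken from the Hadoop-BAM sources).
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.seqdoop.hadoop_bam.FastqOutputFormat;
import org.seqdoop.hadoop_bam.SequencedFragment;

public class FastqOutputDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Emit Illumina (Phred+64) qualities instead of the default "sanger".
		conf.set(FastqOutputFormat.CONF_BASE_QUALITY_ENCODING, "illumina");

		Job job = Job.getInstance(conf, "fastq-output-example");
		job.setJarByClass(FastqOutputDriver.class);

		// The mapper/reducer (not shown) must emit Text keys and SequencedFragment values;
		// a null key makes FastqRecordWriter build a Casava 1.8-style id from the fragment itself.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(SequencedFragment.class);
		job.setOutputFormatClass(FastqOutputFormat.class);

		// Optional gzip compression, handled in getRecordWriter() above.
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
		FileOutputFormat.setOutputPath(job, new Path(args[0]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

Because FastqRecordWriter.setConf reads hbam.fastq-output.base-quality-encoding from the job configuration, the quality encoding can be switched per job without code changes; any value other than "sanger" or "illumina" is rejected when the writer is constructed.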



