All downloads are free. Search and download functionality uses the official Maven repository.

org.seqdoop.hadoop_bam.VariantContextCodec Maven / Gradle / Ivy

Go to download

A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.

There is a newer version: 7.10.0
Show newest version
// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2013-07-03 14:59:14

package org.seqdoop.hadoop_bam;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.reflect.Array;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFEncoder;
import htsjdk.variant.vcf.VCFHeader;

// See the comment in VariantContextWritable explaining what this is used for.
/**
 * Serializes {@link VariantContext} objects to a {@link DataOutput} and reads
 * them back from a {@link DataInput}.
 *
 * <p>Record layout, in order. Every integer is a 4-byte int; "bytes" means an
 * int length followed by that many raw bytes; "string" means UTF-8 bytes.
 * <ol>
 *   <li>contig name (string)</li>
 *   <li>start, end (ints)</li>
 *   <li>ID (string; a zero length is read back as "no ID")</li>
 *   <li>allele count, then each allele's display bases (bytes); the first
 *       allele is read back as the reference allele</li>
 *   <li>log10 error as a float, or the bit pattern 0x7f800001 if absent</li>
 *   <li>filter count followed by that many strings; or -1 if filters were
 *       applied and none failed; or -2 if the record is unfiltered</li>
 *   <li>attribute count, then per attribute a key (string) and a typed
 *       value (one {@code AttrType} byte followed by the payload)</li>
 *   <li>genotype count, a kind byte (0 = VCF text, 1 = BCF2 bytes), the
 *       unparsed genotype bytes, and for BCF2 only the number of genotype
 *       fields (int)</li>
 * </ol>
 */
public final class VariantContextCodec {
	/** The "missing value" used in BCF2, a signaling NaN. */
	private static final int MISSING_QUAL_BITS = 0x7f800001;

	/** Filter-count marker: filters were applied and the record passed. */
	private static final int FILTERS_PASSED = -1;

	/** Filter-count marker: no filters have been applied to the record. */
	private static final int FILTERS_UNAPPLIED = -2;

	/** Genotype kind byte: payload is VCF genotype text encoded as UTF-8. */
	private static final byte GENOTYPES_VCF = 0;

	/** Genotype kind byte: payload is raw BCF2 genotype bytes. */
	private static final byte GENOTYPES_BCF = 1;

	/**
	 * Writes {@code vc} to {@code out} in the layout described on this class.
	 *
	 * <p>Genotypes are never decoded here: either the still-unparsed lazy
	 * genotype data is copied through, or, for a
	 * {@code VariantContextWithHeader}, the genotypes are rendered to VCF text
	 * using its header.
	 *
	 * @param out destination
	 * @param vc  the variant to serialize
	 * @throws IOException           if writing to {@code out} fails
	 * @throws IllegalStateException if {@code vc} has neither lazy genotype
	 *         data nor a VCF header, if the header is null, or if the lazy
	 *         data is neither a String nor a {@code BCF2Codec.LazyData}
	 */
	public static void write(final DataOutput out, final VariantContext vc)
		throws IOException
	{
		final Object genotypesData;
		final int numGenotypes;
		if (vc.getGenotypes().isLazyWithData()) {
			final htsjdk.variant.variantcontext.LazyGenotypesContext gc =
				(htsjdk.variant.variantcontext.LazyGenotypesContext)
					vc.getGenotypes();

			genotypesData = gc.getUnparsedGenotypeData();
			numGenotypes  = gc.size();
		}
		else if (vc instanceof VariantContextWithHeader) {
			final VCFHeader header = ((VariantContextWithHeader)vc).getHeader();

			if (header == null) {
				throw new IllegalStateException( "Header not set inside VariantContextWithHeader" );
			}

			genotypesData = formatGenotypesAsVCF(vc, header);
			numGenotypes  = vc.getGenotypes().size();
		}
		else {
			throw new IllegalStateException( "Cannot write fully decoded VariantContext: need lazy genotypes or VCF Header" );
		}

		// Validate before emitting anything so that a failure does not leave a
		// partially written record in the output.
		final boolean genotypesAreText = genotypesData instanceof String;
		if (!genotypesAreText && !(genotypesData instanceof BCF2Codec.LazyData))
			throw new IllegalStateException(
				"Unrecognized unparsed genotype data, expected String or "+
				"BCF2Codec.LazyData: "+
				(genotypesData == null ? null : genotypesData.getClass()));

		writeString(out, vc.getContig());

		out.writeInt(vc.getStart());
		out.writeInt(vc.getEnd());

		writeString(out, vc.getID());

		final List<Allele> alleles = vc.getAlleles();
		out.writeInt(alleles.size());
		for (final Allele allele : alleles)
			writeBytes(out, allele.getDisplayBases());

		if (vc.hasLog10PError())
			out.writeFloat((float)vc.getLog10PError());
		else
			out.writeInt(MISSING_QUAL_BITS);

		if (vc.isFiltered()) {
			final Set<String> filters = vc.getFilters();
			out.writeInt(filters.size());
			for (final String filter : filters)
				writeString(out, filter);
		} else
			out.writeInt(
				vc.filtersWereApplied() ? FILTERS_PASSED : FILTERS_UNAPPLIED);

		final Map<String,Object> attrs = vc.getAttributes();
		out.writeInt(attrs.size());
		for (final Map.Entry<String,Object> ent : attrs.entrySet()) {
			writeString(out, ent.getKey());
			encodeAttrVal(out, ent.getValue());
		}

		out.writeInt(numGenotypes);

		if (genotypesAreText) {
			out.writeByte(GENOTYPES_VCF);
			writeString(out, (String)genotypesData);
		} else {
			final BCF2Codec.LazyData data = (BCF2Codec.LazyData)genotypesData;
			out.writeByte(GENOTYPES_BCF);
			writeBytes(out, data.bytes);
			out.writeInt(data.nGenotypeFields);
		}
	}

	/**
	 * Renders the genotypes of {@code vc} as VCF text: a field separator, the
	 * FORMAT key list, and then whatever {@code VCFEncoder.addGenotypeData}
	 * appends. Returns the empty string when there are no genotype keys.
	 */
	private static String formatGenotypesAsVCF(
			final VariantContext vc, final VCFHeader header)
		throws IOException
	{
		final List<String> genotypeAttributeKeys = vc.calcVCFGenotypeKeys(header);
		final StringBuilder builder = new StringBuilder();
		if (!genotypeAttributeKeys.isEmpty()) {
			// TODO: the VCFEncoder equivalent of this code checks for missing header fields here.  do we care?

			final String genotypeFormatString = ParsingUtils.join(
				VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);

			builder.append(VCFConstants.FIELD_SEPARATOR);
			builder.append(genotypeFormatString);

			final VCFEncoder encoder = new VCFEncoder(header, true, false);
			final Map<Allele,String> alleleStrings = encoder.buildAlleleStrings(vc);
			encoder.addGenotypeData(vc, alleleStrings, genotypeAttributeKeys, builder);
		}
		return builder.toString();
	}

	/**
	 * Reads one record previously produced by {@link #write}.
	 *
	 * <p>The genotypes are not parsed: the raw bytes are wrapped in a
	 * {@code LazyVCFGenotypesContext} or {@code LazyBCFGenotypesContext}
	 * depending on the kind byte.
	 *
	 * @param in source positioned at the start of a record
	 * @return the reconstructed variant
	 * @throws IOException if reading fails, or if the data contains a negative
	 *         length or count, an unknown attribute type, or an unknown
	 *         genotype kind
	 */
	public static VariantContext read(final DataInput in) throws IOException {
		final VariantContextBuilder builder = new VariantContextBuilder();

		int count, len;

		// Scratch buffer reused (and grown on demand) for the short strings.
		byte[] b;

		len = readNonNegativeInt(in, "contig name length");
		b = new byte[len];
		in.readFully(b);
		final String chrom = new String(b, StandardCharsets.UTF_8);
		builder.chr(chrom);

		final int start = in.readInt();
		builder.start(start);
		builder.stop (in.readInt());

		len = readNonNegativeInt(in, "ID length");
		if (len == 0)
			builder.noID();
		else {
			b = readInto(in, b, len);
			builder.id(new String(b, 0, len, StandardCharsets.UTF_8));
		}

		count = readNonNegativeInt(in, "allele count");
		final List<Allele> alleles = new ArrayList<>(count);
		for (int i = 0; i < count; ++i) {
			len = readNonNegativeInt(in, "allele length");
			b = readInto(in, b, len);
			// The first allele written is the reference allele.
			alleles.add(Allele.create(Arrays.copyOf(b, len), i == 0));
		}
		builder.alleles(alleles);

		final int qualInt = in.readInt();
		builder.log10PError(
			qualInt == MISSING_QUAL_BITS
				? VariantContext.NO_LOG10_PERROR
				: Float.intBitsToFloat(qualInt));

		count = in.readInt();
		switch (count) {
		case FILTERS_UNAPPLIED: builder.unfiltered();  break;
		case FILTERS_PASSED:    builder.passFilters(); break;
		default:
			if (count < 0)
				throw new IOException("Invalid negative filter count: " + count);
			while (count-- > 0) {
				len = readNonNegativeInt(in, "filter length");
				b = readInto(in, b, len);
				builder.filter(new String(b, 0, len, StandardCharsets.UTF_8));
			}
			break;
		}

		count = readNonNegativeInt(in, "attribute count");
		final Map<String,Object> attrs = new HashMap<>(count, 1);
		while (count-- > 0) {
			len = readNonNegativeInt(in, "attribute key length");
			b = readInto(in, b, len);
			attrs.put(
				new String(b, 0, len, StandardCharsets.UTF_8),
				decodeAttrVal(in));
		}
		builder.attributes(attrs);

		count = readNonNegativeInt(in, "genotype count");
		final byte genoType = in.readByte();
		len = readNonNegativeInt(in, "genotype data length");

		// Resize b even if it's already big enough, minimizing the amount of
		// memory LazyGenotypesContext hangs on to.
		b = new byte[len];
		in.readFully(b);

		switch (genoType) {
		case GENOTYPES_VCF:
			builder.genotypesNoValidation(
				new LazyVCFGenotypesContext(alleles, chrom, start, b, count));
			break;

		case GENOTYPES_BCF:
			builder.genotypesNoValidation(
				new LazyBCFGenotypesContext(alleles, in.readInt(), b, count));
			break;

		default:
			throw new IOException(
				"Invalid genotypes type identifier: cannot decode");
		}

		return builder.make();
	}

	/** Writes {@code bytes} as an int length followed by the bytes. */
	private static void writeBytes(final DataOutput out, final byte[] bytes)
		throws IOException
	{
		out.writeInt(bytes.length);
		out.write   (bytes);
	}

	/** Writes {@code s} as length-prefixed UTF-8. */
	private static void writeString(final DataOutput out, final String s)
		throws IOException
	{
		writeBytes(out, s.getBytes(StandardCharsets.UTF_8));
	}

	/**
	 * Reads an int that must not be negative, so that corrupt input is
	 * reported as an IOException instead of a NegativeArraySizeException or
	 * IllegalArgumentException from a later allocation.
	 */
	private static int readNonNegativeInt(final DataInput in, final String what)
		throws IOException
	{
		final int value = in.readInt();
		if (value < 0)
			throw new IOException("Invalid negative " + what + ": " + value);
		return value;
	}

	/**
	 * Reads {@code len} bytes into the start of {@code buf}, or into a new
	 * array of exactly {@code len} bytes if {@code buf} is too small. Returns
	 * the array that was filled.
	 */
	private static byte[] readInto(
			final DataInput in, final byte[] buf, final int len)
		throws IOException
	{
		final byte[] dst = len > buf.length ? new byte[len] : buf;
		in.readFully(dst, 0, len);
		return dst;
	}

	// The VCF 4.1 spec says: "Integer, Float, Flag, Character, and String". But
	// there can be many, so we also have ARRAY.
	//
	// In addition, VariantContext seems to represent some/all floats as doubles
	// at least when reading from BCF, and at least BCF2FieldEncoder assumes
	// them to be of class Double so we have to preserve doubles and thus must
	// have DOUBLE.
	//
	// The ordinal of each constant is the byte written to the stream, so the
	// declaration order is part of the serialized format: do not reorder or
	// remove constants, only append.
	private enum AttrType {
		INT, FLOAT, BOOL, CHAR, STRING, ARRAY, DOUBLE;

		public byte toByte() { return (byte)ordinal(); }

		private static final AttrType[] VALUES = values();

		/**
		 * Maps a type byte read from the stream back to its constant.
		 *
		 * @throws IOException if {@code b} does not identify a known type
		 */
		public static AttrType fromByte(byte b) throws IOException {
			if (b < 0 || b >= VALUES.length)
				throw new IOException("Invalid type identifier: cannot decode");
			return VALUES[b];
		}
	}

	/**
	 * Writes one attribute value as a type byte followed by its payload.
	 *
	 * <p>Lists and arrays are both written as ARRAY (element count followed by
	 * each element, recursively). Anything that is not an Integer, Float,
	 * Double, Boolean, Character, List or array is written as the UTF-8 bytes
	 * of its {@code toString()}. Note that {@code null} and any value whose
	 * string form is empty are both written as a zero-length STRING, which
	 * {@link #decodeAttrVal} reads back as {@code null}.
	 */
	private static void encodeAttrVal(final DataOutput out, final Object v)
		throws IOException
	{
		if (v instanceof Integer) {
			out.writeByte(AttrType.INT.toByte());
			out.writeInt ((Integer)v);
		} else if (v instanceof Float) {
			out.writeByte (AttrType.FLOAT.toByte());
			out.writeFloat((Float)v);
		} else if (v instanceof Double) {
			out.writeByte  (AttrType.DOUBLE.toByte());
			out.writeDouble((Double)v);
		} else if (v instanceof Boolean) {
			out.writeByte   (AttrType.BOOL.toByte());
			out.writeBoolean((Boolean)v);
		} else if (v instanceof Character) {
			out.writeByte(AttrType.CHAR.toByte());
			out.writeChar((Character)v);

		} else if (v instanceof List) {
			encodeAttrVal(out, ((List<?>)v).toArray());

		} else if (v != null && v.getClass().isArray()) {
			// Reflection handles primitive arrays as well as Object[].
			out.writeByte(AttrType.ARRAY.toByte());
			final int length = Array.getLength(v);
			out.writeInt(length);
			for (int i = 0; i < length; ++i)
				encodeAttrVal(out, Array.get(v, i));

		} else {
			out.writeByte(AttrType.STRING.toByte());
			if (v == null)
				out.writeInt(0);
			else
				writeString(out, v.toString());
		}
	}

	/**
	 * Reads one attribute value written by {@link #encodeAttrVal}.
	 *
	 * @return an Integer, Float, Double, Boolean, Character, String, a List of
	 *         such values, or {@code null} for a zero-length STRING
	 * @throws IOException if reading fails, the type byte is unknown, or a
	 *         length is negative
	 */
	private static Object decodeAttrVal(final DataInput in) throws IOException {
		final AttrType type = AttrType.fromByte(in.readByte());
		switch (type) {
			case INT:    return in.readInt();
			case FLOAT:  return in.readFloat();
			case DOUBLE: return in.readDouble();
			case BOOL:   return in.readBoolean();
			case CHAR:   return in.readChar();
			case ARRAY: {
				// VariantContext.fullyDecodeAttributes() checks for "instanceof
				// List" so we have to return a List, not an array, here.
				int len = readNonNegativeInt(in, "attribute array length");
				final List<Object> os = new ArrayList<>(len);
				while (len-- > 0)
					os.add(decodeAttrVal(in));
				return os;
			}
			case STRING: {
				final int len = readNonNegativeInt(in, "attribute string length");
				if (len == 0)
					return null;
				final byte[] b = new byte[len];
				in.readFully(b);
				return new String(b, StandardCharsets.UTF_8);
			}
		}
		// Only reachable if a constant is added to AttrType without a case
		// above.
		throw new IOException("Invalid type identifier: cannot decode");
	}
}