org.seqdoop.hadoop_bam.VariantContextCodec Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2013-07-03 14:59:14
package org.seqdoop.hadoop_bam;
import java.io.DataOutput;
import java.io.DataInput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.lang.reflect.Array;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFEncoder;
import htsjdk.variant.vcf.VCFHeader;
// See the comment in VariantContextWritable explaining what this is used for.
public final class VariantContextCodec {
/**
 * Serializes a {@link VariantContext} to {@code out} in the binary layout
 * that {@link #read(DataInput)} consumes.
 *
 * <p>Fields are written in this order, every string or byte array being
 * preceded by its length as a 4-byte int:
 * <ol>
 * <li>contig name (UTF-8), start, end</li>
 * <li>ID (UTF-8)</li>
 * <li>allele count, then each allele's display bases</li>
 * <li>log10 P(error) as a float, or the int {@code 0x7f800001} when the
 *     context has none</li>
 * <li>filter count followed by the filter names, or {@code -1} when filters
 *     were applied but none is set, or {@code -2} when no filters were
 *     applied</li>
 * <li>attribute count, then each key (UTF-8) followed by its encoded
 *     value</li>
 * <li>genotype count, a type byte ({@code 0} = VCF text, {@code 1} = BCF
 *     bytes), the undecoded genotype bytes and, for BCF only, the number of
 *     genotype fields</li>
 * </ol>
 *
 * @param out destination of the encoded record
 * @param vc  the variant to encode; it must either still hold its unparsed
 *            genotype data or be a {@code VariantContextWithHeader} with a
 *            header set
 * @throws IOException           if writing to {@code out} fails
 * @throws IllegalStateException if the genotypes can neither be passed
 *                               through unparsed nor re-encoded with a
 *                               header, or if the unparsed data is of an
 *                               unrecognized class
 */
public static void write(final DataOutput out, final VariantContext vc)
    throws IOException
{
    final Object genotypesData;
    final int numGenotypes;

    if (vc.getGenotypes().isLazyWithData()) {
        // The genotypes still carry their unparsed form: forward it as-is.
        final htsjdk.variant.variantcontext.LazyGenotypesContext gc =
            (htsjdk.variant.variantcontext.LazyGenotypesContext)
                vc.getGenotypes();

        genotypesData = gc.getUnparsedGenotypeData();
        numGenotypes = gc.size();
    }
    else if (vc instanceof VariantContextWithHeader) {
        // Re-encode the genotypes as VCF text using the attached header.
        final VCFHeader header = ((VariantContextWithHeader)vc).getHeader();
        if (header == null) {
            throw new IllegalStateException( "Header not set inside VariantContextWithHeader" );
        }

        final List<String> genotypeAttributeKeys = vc.calcVCFGenotypeKeys(header);
        final StringBuilder builder = new StringBuilder();
        if ( ! genotypeAttributeKeys.isEmpty()) {
            // TODO: the VCFEncoder equivalent of this code checks for missing header fields here. do we care?
            final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);

            builder.append(VCFConstants.FIELD_SEPARATOR);
            builder.append(genotypeFormatString);

            final VCFEncoder encoder = new VCFEncoder(header, true, false);
            final Map<Allele, String> alleleStrings = encoder.buildAlleleStrings(vc);
            encoder.addGenotypeData(vc, alleleStrings, genotypeAttributeKeys, builder);
        }
        genotypesData = builder.toString();
        numGenotypes = vc.getGenotypes().size();
    }
    else {
        throw new IllegalStateException( "Cannot write fully decoded VariantContext: need lazy genotypes or VCF Header" );
    }

    // Validate before anything is written so that a failure here does not
    // leave a partial record in the output.
    if (!(genotypesData instanceof String || genotypesData instanceof BCF2Codec.LazyData))
        throw new IllegalStateException(
            "Unrecognized unparsed genotype data, expected String or "+
            "BCF2Codec.LazyData: "+ genotypesData.getClass());

    writeLengthPrefixed(out, vc.getContig().getBytes("UTF-8"));

    out.writeInt(vc.getStart());
    out.writeInt(vc.getEnd());

    writeLengthPrefixed(out, vc.getID().getBytes("UTF-8"));

    final List<Allele> alleles = vc.getAlleles();
    out.writeInt(alleles.size());
    for (final Allele allele : alleles) {
        writeLengthPrefixed(out, allele.getDisplayBases());
    }

    if (vc.hasLog10PError())
        out.writeFloat((float)vc.getLog10PError());
    else {
        // The "missing value" used in BCF2, a signaling NaN.
        out.writeInt(0x7f800001);
    }

    if (vc.isFiltered()) {
        final Set<String> filters = vc.getFilters();
        out.writeInt(filters.size());
        for (final String s : filters) {
            writeLengthPrefixed(out, s.getBytes("UTF-8"));
        }
    } else {
        // Negative sentinels: -1 = filters applied but none set,
        // -2 = filters never applied.
        out.writeInt(vc.filtersWereApplied() ? -1 : -2);
    }

    final Map<String, Object> attrs = vc.getAttributes();
    out.writeInt(attrs.size());
    for (final Map.Entry<String, Object> ent : attrs.entrySet()) {
        writeLengthPrefixed(out, ent.getKey().getBytes("UTF-8"));
        encodeAttrVal(out, ent.getValue());
    }

    out.writeInt(numGenotypes);

    if (genotypesData instanceof String) {
        out.writeByte(0);
        writeLengthPrefixed(out, ((String)genotypesData).getBytes("UTF-8"));
    } else {
        assert genotypesData instanceof BCF2Codec.LazyData;
        final BCF2Codec.LazyData data = (BCF2Codec.LazyData)genotypesData;

        out.writeByte(1);
        writeLengthPrefixed(out, data.bytes);
        out.writeInt(data.nGenotypeFields);
    }
}

// Writes the array's length as a 4-byte int, followed by the array itself.
private static void writeLengthPrefixed(final DataOutput out, final byte[] bytes)
    throws IOException
{
    out.writeInt(bytes.length);
    out.write   (bytes);
}
/**
 * Reconstructs a {@link VariantContext} from the binary layout produced by
 * {@link #write(DataOutput, VariantContext)}.
 *
 * <p>Genotypes are not decoded here: their raw bytes are wrapped in a
 * {@code LazyVCFGenotypesContext} (type byte 0) or a
 * {@code LazyBCFGenotypesContext} (type byte 1) and attached without
 * validation.
 *
 * @param in source positioned at the first byte of an encoded record
 * @return the decoded variant
 * @throws IOException if reading from {@code in} fails, or if the record
 *                     holds a negative length or count, an unknown negative
 *                     filter count, or an unknown genotypes type byte
 */
public static VariantContext read(final DataInput in) throws IOException {
    final VariantContextBuilder builder = new VariantContextBuilder();

    int count, len;
    // Scratch buffer reused for every short string; it only ever grows.
    byte[] b;

    len = readSize(in, "contig name length");
    b = new byte[len];
    in.readFully(b);
    final String chrom = new String(b, "UTF-8");
    builder.chr(chrom);

    final int start = in.readInt();
    builder.start(start);
    builder.stop (in.readInt());

    len = readSize(in, "ID length");
    if (len == 0)
        builder.noID();
    else {
        if (len > b.length) b = new byte[len];
        in.readFully(b, 0, len);
        builder.id(new String(b, 0, len, "UTF-8"));
    }

    count = readSize(in, "allele count");
    final List<Allele> alleles = new ArrayList<Allele>(count);
    for (int i = 0; i < count; ++i) {
        len = readSize(in, "allele length");
        if (len > b.length) b = new byte[len];
        in.readFully(b, 0, len);
        // The first allele is created as the reference allele.
        alleles.add(Allele.create(Arrays.copyOf(b, len), i == 0));
    }
    builder.alleles(alleles);

    // 0x7f800001 is the bit pattern the writer emits when there is no
    // log10 P(error) value.
    final int qualInt = in.readInt();
    builder.log10PError(
        qualInt == 0x7f800001
            ? VariantContext.NO_LOG10_PERROR
            : Float.intBitsToFloat(qualInt));

    count = in.readInt();
    switch (count) {
    case -2: builder.unfiltered();  break;
    case -1: builder.passFilters(); break;
    default:
        // -1 and -2 are the only negative values the writer produces.
        if (count < 0)
            throw new IOException("Invalid filter count: " + count);
        while (count-- > 0) {
            len = readSize(in, "filter length");
            if (len > b.length) b = new byte[len];
            in.readFully(b, 0, len);
            builder.filter(new String(b, 0, len, "UTF-8"));
        }
        break;
    }

    count = readSize(in, "attribute count");
    final Map<String, Object> attrs = new HashMap<String, Object>(count, 1);
    while (count-- > 0) {
        len = readSize(in, "attribute key length");
        if (len > b.length) b = new byte[len];
        in.readFully(b, 0, len);
        attrs.put(new String(b, 0, len, "UTF-8"), decodeAttrVal(in));
    }
    builder.attributes(attrs);

    count = readSize(in, "genotype count");
    final byte genoType = in.readByte();
    len = readSize(in, "genotype data length");

    // Resize b even if it's already big enough, minimizing the amount of
    // memory LazyGenotypesContext hangs on to.
    b = new byte[len];
    in.readFully(b);

    switch (genoType) {
    case 0:
        builder.genotypesNoValidation(
            new LazyVCFGenotypesContext(alleles, chrom, start, b, count));
        break;

    case 1:
        // BCF records carry one extra int: the number of genotype fields.
        builder.genotypesNoValidation(
            new LazyBCFGenotypesContext(alleles, in.readInt(), b, count));
        break;

    default:
        throw new IOException(
            "Invalid genotypes type identifier: cannot decode");
    }

    return builder.make();
}

// Reads a 4-byte int that must be a non-negative length or count; 'what'
// names the field in the error message.
private static int readSize(final DataInput in, final String what)
    throws IOException
{
    final int n = in.readInt();
    if (n < 0)
        throw new IOException("Invalid " + what + ": " + n);
    return n;
}
// The VCF 4.1 spec lists the value types "Integer, Float, Flag, Character, and
// String". But an attribute can hold several values, so we also have ARRAY.
//
// In addition, VariantContext seems to represent some/all floats as doubles
// at least when reading from BCF, and at least BCF2FieldEncoder assumes
// them to be of class Double so we have to preserve doubles and thus must
// have DOUBLE.
/**
 * One-byte type tag identifying how an attribute value is encoded.
 *
 * The tag is the constant's ordinal, so the declaration order below is part
 * of the serialized format: do not reorder or insert constants.
 */
private enum AttrType {
    INT,
    FLOAT,
    BOOL,
    CHAR,
    STRING,
    ARRAY,
    DOUBLE;

    // Looked up once so that fromByte() does not call values() each time.
    private static final AttrType[] BY_ORDINAL = AttrType.values();

    /** Returns this constant's ordinal narrowed to a byte. */
    public byte toByte() {
        return (byte) this.ordinal();
    }

    /** Returns the constant whose ordinal is {@code b}. */
    public static AttrType fromByte(byte b) {
        return BY_ORDINAL[b];
    }
}
/**
 * Writes a single attribute value as a one-byte {@code AttrType} tag
 * followed by its payload.
 *
 * <ul>
 * <li>{@code Integer}, {@code Float}, {@code Double}, {@code Boolean} and
 *     {@code Character} are written with the matching {@code DataOutput}
 *     primitive writer.</li>
 * <li>A {@code List} is converted to an array; an array is written as its
 *     length followed by each element, encoded recursively.</li>
 * <li>Anything else is written as a STRING: the length and UTF-8 bytes of
 *     {@code toString()}, or just a length of 0 for {@code null}.</li>
 * </ul>
 *
 * @param out destination of the encoded value
 * @param v   the value to encode; may be {@code null}
 * @throws IOException if writing to {@code out} fails
 */
private static void encodeAttrVal(final DataOutput out, final Object v)
    throws IOException
{
    if (v instanceof Integer) {
        out.writeByte(AttrType.INT.toByte());
        out.writeInt(((Integer)v).intValue());
        return;
    }
    if (v instanceof Float) {
        out.writeByte(AttrType.FLOAT.toByte());
        out.writeFloat(((Float)v).floatValue());
        return;
    }
    if (v instanceof Double) {
        out.writeByte(AttrType.DOUBLE.toByte());
        out.writeDouble(((Double)v).doubleValue());
        return;
    }
    if (v instanceof Boolean) {
        out.writeByte(AttrType.BOOL.toByte());
        out.writeBoolean(((Boolean)v).booleanValue());
        return;
    }
    if (v instanceof Character) {
        out.writeByte(AttrType.CHAR.toByte());
        out.writeChar(((Character)v).charValue());
        return;
    }
    if (v instanceof List) {
        // Lists take the same path as arrays.
        final Object[] elements = ((List<?>)v).toArray();
        encodeAttrVal(out, elements);
        return;
    }
    if (v != null && v.getClass().isArray()) {
        // Reflection is used so that primitive arrays are handled too.
        final int n = Array.getLength(v);
        out.writeByte(AttrType.ARRAY.toByte());
        out.writeInt(n);
        int idx = 0;
        while (idx < n) {
            encodeAttrVal(out, Array.get(v, idx));
            idx++;
        }
        return;
    }

    // Fallback: null and every remaining type are stored as a string.
    out.writeByte(AttrType.STRING.toByte());
    if (v == null) {
        out.writeInt(0);
        return;
    }
    final byte[] utf8 = v.toString().getBytes("UTF-8");
    out.writeInt(utf8.length);
    out.write(utf8);
}
private static Object decodeAttrVal(final DataInput in) throws IOException {
switch (AttrType.fromByte(in.readByte())) {
case INT: return in.readInt();
case FLOAT: return in.readFloat();
case DOUBLE: return in.readDouble();
case BOOL: return in.readBoolean();
case CHAR: return in.readChar();
case ARRAY: {
// VariantContext.fullyDecodeAttributes() checks for "instanceof
// List" so we have to return a List, not an array, here.
int len = in.readInt();
final List