org.opencb.biodata.tools.variant.merge.VariantAlternateRearranger Maven / Gradle / Ivy
package org.opencb.biodata.tools.variant.merge;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.variant.Genotype;
import org.opencb.biodata.models.variant.metadata.VariantFileHeader;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
import org.opencb.biodata.tools.variant.converters.avro.VariantFileHeaderToVCFHeaderConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.util.*;
import java.util.function.Consumer;
/**
* Created on 16/06/17.
*
* @author Jacobo Coll <[email protected]>
*/
public class VariantAlternateRearranger {
// Default ploidy as defined in vcf-spec
public static final int DEFAULT_PLOIDY = 2;
// Map from reordered allele position to original allele position
private final int[] map;
// Map from original allele position to reordered allele position
private final int[] reverseMap;
private final Configuration configuration;
// Lazy initialization
private int[] gMapP1; // Ploidy = 1
private int[] gMapP2; // Ploidy = 2
private Map gMapP; // Ploidy = key
protected Logger logger = LoggerFactory.getLogger(this.getClass().toString());
public VariantAlternateRearranger(List originalAlternates, List reorderedAlternates) {
this(originalAlternates, reorderedAlternates, null);
}
public VariantAlternateRearranger(List originalAlternates, List reorderedAlternates,
Configuration configuration) {
this.configuration = configuration == null ? new Configuration() : configuration;
// if (originalAlternates.size() > reorderedAlternates.size()) {
// throw new IllegalArgumentException("Expected all original alternates in reordered alternates");
// }
this.map = new int[reorderedAlternates.size()];
this.reverseMap = new int[originalAlternates.size()];
for (int i = 0; i < reorderedAlternates.size(); i++) {
map[i] = originalAlternates.indexOf(reorderedAlternates.get(i));
}
for (int i = 0; i < originalAlternates.size(); i++) {
reverseMap[i] = reorderedAlternates.indexOf(originalAlternates.get(i));
// if (reverseMap[i] < 0) {
// throw new IllegalArgumentException("Missing alternate in list of reordered alternates");
// }
}
}
public Genotype rearrangeGenotype(String genotypeString, String reference, List alternateAlleles) {
return rearrangeGenotype(new Genotype(genotypeString, reference, alternateAlleles));
}
public Genotype rearrangeGenotype(Genotype genotype) {
int[] allelesIdx = genotype.getAllelesIdx();
for (int i = 0; i < allelesIdx.length; i++) {
allelesIdx[i] = remapAlleleReverse(allelesIdx[i]);
}
return genotype;
}
public String rearrangeNumberR(String data) {
return rearrangeNumberR(data, ".");
}
public String rearrangeNumberR(String data, String missingValue) {
// List values = Arrays.asList(StringUtils.splitPreserveAllTokens(data, ','));
List values = Arrays.asList(data.split(",", -1));
return rearrange(values, missingValue, true, ",", map);
}
public List rearrangeNumberR(List values, T missingValue) {
return rearrange(values, missingValue, true, map);
}
public String rearrangeNumberA(String data) {
return rearrangeNumberA(data, ".");
}
public String rearrangeNumberA(String data, String missingValue) {
// List values = Arrays.asList(StringUtils.splitPreserveAllTokens(data, ','));
List values = Arrays.asList(data.split(",", -1));
return rearrange(values, missingValue, false, ",", map);
}
public List rearrangeNumberA(List values, T missingValue) {
return rearrange(values, missingValue, false, map);
}
public String rearrangeNumberG(String data, String missingValue, @Nullable Integer ploidy) {
int[] gMap = getGenotypeReorderingMap(ploidy);
// List values = Arrays.asList(StringUtils.splitPreserveAllTokens(data, ','));
List values = Arrays.asList(data.split(",", -1));
return rearrange(values, missingValue, false, ",", gMap);
}
public List rearrangeNumberG(List values, T missingValue, @Nullable Integer ploidy) {
int[] gMap = getGenotypeReorderingMap(ploidy);
return rearrange(values, missingValue, false, gMap);
}
private List rearrange(List originalValues, T missingValue, boolean includeReference, int[] map) {
List rearrangedValues = new ArrayList<>(includeReference ? this.map.length + 1 : this.map.length);
rearrange(originalValues, missingValue, rearrangedValues::add, includeReference, map);
return rearrangedValues;
}
private String rearrange(List originalValues, String missingValue, boolean includeReference, String separator, int[] map) {
StringBuilder sb = new StringBuilder();
rearrange(originalValues, missingValue, str -> {
if (sb.length() > 0) {
sb.append(separator);
}
sb.append(str);
}, includeReference, map);
return sb.toString();
}
private void rearrange(List originalValues, T missingValue, Consumer consumer, boolean includeReference, int[] map) {
int expectedLength = map.length;
if (includeReference) {
expectedLength++;
}
if (originalValues.size() > expectedLength) {
throw new IllegalArgumentException("Expected no more than " + expectedLength + " values. "
+ "Got " + originalValues.size() + " : " + originalValues);
}
if (includeReference) {
consumer.accept(originalValues.get(0));
}
for (int originalPosition : map) {
if (originalPosition < 0) {
consumer.accept(missingValue);
} else {
if (includeReference) {
originalPosition++;
}
if (originalPosition < originalValues.size()) {
consumer.accept(originalValues.get(originalPosition));
} else {
consumer.accept(missingValue);
}
}
}
}
private int[] getGenotypeReorderingMap(@Nullable Integer ploidy) {
if (ploidy == null || ploidy <= 0) {
// Ploidy 2 is assumed when missing.
ploidy = DEFAULT_PLOIDY;
}
// Lazy initialization
switch (ploidy) {
case 1:
if (gMapP1 == null) {
gMapP1 = getGenotypeReorderingMap(ploidy, map);
}
return gMapP1;
case 2:
if (gMapP2 == null) {
gMapP2 = getGenotypeReorderingMap(ploidy, map);
}
return gMapP2;
default:
if (gMapP == null) {
gMapP = new HashMap<>();
}
return gMapP.computeIfAbsent(ploidy, p -> getGenotypeReorderingMap(p, map));
}
}
/**
* Gets an array for reordering the positions of a format field with Number=G and ploidy=2.
*
* In those fields where there is a value per genotype, if we change the order of the alleles,
* the order of the genotypes will also change.
*
* The genotypes ordering is defined in the Vcf spec : https://samtools.github.io/hts-specs/VCFv4.3.pdf as "Genotype Ordering"
* Given a number of alleles N and a ploidy of P, the order algorithm is:
* for (a_p = 0; a_p < N: a_p++)
* for (a_p-1 = 0; a_p-1 <= a_p: a_p-1++)
* ...
* for (a_1 = 0; a_1 <= a_2: a_1++)
* print(a_1, a_2, ..., a_p)
*
* i.e:
* N=2,P=2: 00,01,11,02,12,22
* N=3,P=2: 00,01,11,02,12,22,03,13,23,33
*
* With P=2, given a genotype a/b, where a newA1) {
gMap[newA2 * (newA2 + 1) / 2 + newA1] = pos++;
} else {
gMap[newA1 * (newA1 + 1) / 2 + newA2] = pos++;
}
}
}
} else {
List originalOrdering = getOrdering(ploidy, map.length, null);
List reorderedOrdering = getOrdering(ploidy, map.length, map);
gMap = new int[originalOrdering.size()];
for (int i = 0; i < reorderedOrdering.size(); i++) {
gMap[i] = originalOrdering.indexOf(reorderedOrdering.get(i));
}
}
return gMap;
}
private int remapAlleleReverse(int a) {
return remapAllele(reverseMap, a);
}
private static int remapAlleleReverse(int[] map, int a) {
// Map does not contain reference (0). Reference never changes
if (a > 0) {
// Decrement. Map keys does not contain reference
a = ArrayUtils.indexOf(map, a - 1);
if (a >= 0) {
// Increment if not missing. Map values does not contain reference
a++;
}
}
return a;
}
private static int remapAllele(int[] map, int a) {
// Map does not contain reference (0). Reference never changes
if (a > 0) {
// Decrement. Map keys does not contain reference
a = map[a - 1];
if (a >= 0) {
// Increment if not missing. Map values does not contain reference
a++;
}
}
return a;
}
public static List getOrdering(int p, int n, int[] map) {
ArrayList list = new ArrayList<>();
getOrdering(p, n, list, new int[p], p - 1, map);
return list;
}
/**
* Recursive genotype ordering algorithm as seen in VCF-spec.
*
* @link https://samtools.github.io/hts-specs/VCFv4.3.pdf
*
* @param p ploidy
* @param n Number of alternates
* @param list List where to store the genotypes
* @param gt Shared array
* @param pos Position where to write in the array
* @param map Map from reordered allele position to original allele position
*/
public static void getOrdering(int p, int n, List list, int[] gt, int pos, int[] map) {
for (int a = 0; a <= n; a++) {
if (map == null) {
gt[pos] = a;
} else {
// Remap allele
gt[pos] = remapAllele(map, a);
}
if (p == 1) {
int prev = gt[0];
int[] gtSorted = gt;
// Check if sorted
for (int g : gt) {
if (prev > g) {
// Sort if needed. Do not modify original array
gtSorted = Arrays.copyOf(gt, gt.length);
Arrays.sort(gtSorted);
break;
}
}
StringBuilder sb = new StringBuilder();
for (int g : gtSorted) {
sb.append(g);
}
list.add(sb.toString());
} else {
getOrdering(p - 1, a, list, gt, pos - 1, map);
}
}
}
public String rearrange(String key, String data) {
return rearrange(key, data, null);
}
public String rearrange(String key, String data, @Nullable Integer ploidy) {
if (StringUtils.isEmpty(data) || data.equals(".")) {
// Do not rearrange missing values
return data;
}
Pair pair =
configuration.otherFieldsMap.getOrDefault(key, Pair.of(VCFHeaderLineType.String, VCFHeaderLineCount.UNBOUNDED));
VCFHeaderLineType type = pair.getLeft();
String missingValue = type.equals(VCFHeaderLineType.Float) || type.equals(VCFHeaderLineType.Integer) ? "0" : ".";
try {
switch (pair.getRight()) {
case A:
data = rearrangeNumberA(data, missingValue);
break;
case R:
data = rearrangeNumberR(data, missingValue);
break;
case G:
data = rearrangeNumberG(data, missingValue, ploidy);
break;
case INTEGER:
case UNBOUNDED:
default:
// Do not rearrange other fields
}
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("Error rearranging key " + key + " = " + data, e);
}
return data;
}
public static class Configuration {
private Map> otherFieldsMap;
public Configuration() {
otherFieldsMap = new HashMap<>();
otherFieldsMap.put("AD", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.R));
otherFieldsMap.put("ADF", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.R));
otherFieldsMap.put("ADR", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.R));
otherFieldsMap.put("AF", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.A));
otherFieldsMap.put("AC", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.A));
otherFieldsMap.put("GL", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.G));
otherFieldsMap.put("GP", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.G));
otherFieldsMap.put("PL", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.G));
otherFieldsMap.put("CNL", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.G));
otherFieldsMap.put("CNP", Pair.of(VCFHeaderLineType.Integer, VCFHeaderLineCount.G));
}
public Configuration configure(VCFHeader header) {
configure(header.getInfoHeaderLines());
configure(header.getFormatHeaderLines());
return this;
}
public Configuration configure(Collection extends VCFHeaderLine> lines) {
lines.stream()
.filter(VCFCompoundHeaderLine.class::isInstance)
.map(VCFCompoundHeaderLine.class::cast)
.forEach(this::configure);
return this;
}
public Configuration configure(VCFCompoundHeaderLine line) {
this.otherFieldsMap.put(line.getID(), Pair.of(line.getType(), line.getCountType()));
return this;
}
public Configuration configure(String key, VCFHeaderLineCount number, VCFHeaderLineType type) {
this.otherFieldsMap.put(key, Pair.of(type, number));
return this;
}
public void configure(VariantFileHeader header) {
for (VariantFileHeaderComplexLine line : header.getComplexLines()) {
if (line.getKey().equalsIgnoreCase("FORMAT") || line.getKey().equalsIgnoreCase("INFO")) {
VCFHeaderLineCount number = VariantFileHeaderToVCFHeaderConverter.getVCFHeaderLineCount(line);
VCFHeaderLineType type = VariantFileHeaderToVCFHeaderConverter.getVCFHeaderLineType(line);
configure(line.getId(), number, type);
}
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy