org.opencb.biodata.tools.variant.merge.VariantMerger Maven / Gradle / Ivy
/*
*
*
*/
/**
*
*/
package org.opencb.biodata.tools.variant.merge;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.variant.Genotype;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantBuilder;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.biodata.models.variant.metadata.VariantFileHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* @author Matthias Haimel [email protected]
*
* TODO: Make this class inmutable. Remove all Atomicreferences
* TODO: Check for duplicated files
*/
public class VariantMerger {
private final Logger logger = LoggerFactory.getLogger(VariantMerger.class);
@Deprecated
public static final String VCF_FILTER = VCFConstants.GENOTYPE_FILTER_KEY;
public static final String GENOTYPE_FILTER_KEY = VCFConstants.GENOTYPE_FILTER_KEY;
public static final String GT_KEY = VCFConstants.GENOTYPE_KEY;
public static final String PASS_VALUE = "PASS";
public static final String DEFAULT_FILTER_VALUE = ".";
public static final String DEFAULT_MISSING_GT = Genotype.NOCALL;
private final AtomicReference gtKey = new AtomicReference<>();
private final AtomicReference filterKey = new AtomicReference<>();
private final AtomicReference annotationFilterKey = new AtomicReference<>();
private final boolean collapseDeletions;
private final LinkedHashMap expectedSamplesPosition = new LinkedHashMap<>();
private final Set expectedSamples = expectedSamplesPosition.keySet();
private final Map expectedFormatsPosition = new LinkedHashMap<>();
private final Map defaultValues = new ConcurrentHashMap<>();
private final AtomicReference studyId = new AtomicReference<>(null);
private final VariantAlternateRearranger.Configuration rearrangerConf = new VariantAlternateRearranger.Configuration();
public VariantMerger() {
this(false);
}
public VariantMerger(boolean collapseDeletions) {
this.gtKey.set(GT_KEY);
this.filterKey.set(GENOTYPE_FILTER_KEY);
this.annotationFilterKey.set(StudyEntry.FILTER);
setDefaultValue(getGtKey(), DEFAULT_MISSING_GT);
setDefaultValue(getFilterKey(), DEFAULT_FILTER_VALUE);
this.collapseDeletions = collapseDeletions;
}
public void setStudyId(String studyId) {
this.studyId.set(studyId);
}
private boolean hasStudyId() {
return this.studyId.get() != null;
}
private String getStudyId() {
return studyId.get();
}
public VariantMerger setExpectedFormats(List formats) {
expectedFormatsPosition.clear();
for (String format : formats) {
if (format.equals(getGtKey()) && !expectedFormatsPosition.isEmpty()) {
throw new IllegalArgumentException("Genotype format field [" + getGtKey() + "] must be in the first position!");
}
expectedFormatsPosition.put(format, expectedFormatsPosition.size());
}
return this;
}
/**
* Adds Sample names to the sample name set.
* Samples names are used to validate the completeness of a variant call.
* If a sample is not seen in the merged variant, the sample will be added as the registered default value.
* The default values are retrieved by {@link #getDefaultValue(String)} and set to {@link #DEFAULT_MISSING_GT} for GT_KEY.
*
* @param sampleNames Collection of sample names.
*/
public void addExpectedSamples(Collection sampleNames) {
sampleNames.forEach(sample -> expectedSamplesPosition.putIfAbsent(sample, expectedSamplesPosition.size()));
}
/**
* Calls {@link Set#clear()} before {@link #addExpectedSamples(Collection)}.
*
* @param sampleNames Collection of Sample names.
*/
public void setExpectedSamples(Collection sampleNames) {
this.expectedSamplesPosition.clear();
this.addExpectedSamples(sampleNames);
}
public Set getExpectedSamples() {
return Collections.unmodifiableSet(this.expectedSamples);
}
public String getGtKey() {
return this.gtKey.get();
}
public void setGtKey(String gtKey) {
updateDefaultKeys(this.gtKey.get(), gtKey);
this.gtKey.set(gtKey);
}
/**
* Update a key
* @param from
* @param to
* @return true if there was a value to be moved.
*/
public boolean updateDefaultKeys(String from, String to) {
if (null == from) {
return false;
}
if (null == to) {
return false;
}
if (StringUtils.equals(from, to)) {
return false;
}
String value = this.defaultValues.remove(from);
if (null == value) {
return false;
}
this.defaultValues.put(to, value);
return true;
}
/**
* Gets the default value of a key or {@link StringUtils#EMPTY} if no key is registered.
* @param key Key
* @return value Registered default value or {@link StringUtils#EMPTY}.
*/
public String getDefaultValue(String key) {
return this.defaultValues.getOrDefault(key, StringUtils.EMPTY);
}
public String getDefaultValue(String key, String valueIfNull) {
return this.defaultValues.getOrDefault(key, valueIfNull);
}
public void setDefaultValue(String key, String value) {
this.defaultValues.put(key, value);
}
public String getFilterKey() {
return this.filterKey.get();
}
public void setFilterKey(String filterKey) {
updateDefaultKeys(this.filterKey.get(), filterKey);
this.filterKey.set(filterKey);
}
public String getAnnotationFilterKey() {
return annotationFilterKey.get();
}
public void setAnnotationFilterKey(String annotationFilterKey) {
updateDefaultKeys(this.annotationFilterKey.get(), annotationFilterKey);
this.annotationFilterKey.set(annotationFilterKey);
}
public VariantMerger configure(VCFHeader header) {
rearrangerConf.configure(header);
return this;
}
public VariantMerger configure(Collection extends VCFHeaderLine> lines) {
rearrangerConf.configure(lines);
return this;
}
public VariantMerger configure(VCFCompoundHeaderLine line) {
rearrangerConf.configure(line);
return this;
}
public VariantMerger configure(String key, VCFHeaderLineCount number, VCFHeaderLineType type) {
rearrangerConf.configure(key, number, type);
return this;
}
public VariantMerger configure(VariantFileHeader header) {
rearrangerConf.configure(header);
return this;
}
/**
* Create and returns a new Variant using the target as a
* position template ONLY and merges the provided variants
* for this position. The target is not present in the
* merged output!!!
*
* @param template Template for position and study only
* @param load Variants to merge for position
* @return Variant new Variant object with merged information
*/
public Variant mergeNew(Variant template, Collection load) {
Variant current = createFromTemplate(template);
merge(current, load);
return current;
}
/**
* Create an empty Variant (position, ref, alt) from a template with basic Study information without samples.
* @param target Variant to take as a template
* @return Variant filled with chromosome, start, end, ref, alt, study ID and format set to GT only, BUT no samples.
*/
public Variant createFromTemplate(Variant target) {
Variant var = new Variant(target.getChromosome(), target.getStart(), target.getEnd(), target.getReference(), target.getAlternate());
var.setType(target.getType());
for(StudyEntry tse : target.getStudies()){
StudyEntry se = new StudyEntry(tse.getStudyId());
se.setFiles(Collections.singletonList(new FileEntry("", null, new HashMap<>())));
se.setSampleDataKeys(Arrays.asList(getGtKey(), getFilterKey()));
se.setSamplesPosition(new HashMap<>());
se.setSamples(new ArrayList<>());
var.addStudyEntry(se);
}
return var;
}
private void isValidVariant(Variant current) throws IllegalArgumentException{
if (current.getType().equals(VariantType.NO_VARIATION)) {
throw new IllegalStateException("Current variant can't be a NO_VARIANT");
}
// Validate variant information
// ensureGtFormat(current);
if (getStudy(current).getSampleDataKeys() == null || getStudy(current).getSampleDataKeys().isEmpty()) {
throw new IllegalArgumentException("Format of sample data is empty!!!!!!");
}
}
/**
* Calls {@link #merge(Variant, Collection)}
*
* @param current {@link Variant} to update.
* @param load {@link Variant} to be merged.
* @return Merged Variant object.
*/
public Variant merge(Variant current, Variant load) {
return merge(current, Collections.singleton(load));
}
/**
* Merge a collection of variants into one variant.
*
* @param current {@link Variant} to update with collection of variants. This object will be modified.
* @param load {@link Variant} to be merged.
* @return Modified {@link Variant} object (passed in as current.
*/
public Variant merge(Variant current, Collection load) {
isValidVariant(current);
// Build alt list
List>> loadAlts =
updateCollapseDeletions(current,
load.stream()
.map(v -> new MutablePair<>(v, buildAltList(v)))
.filter(p -> hasAnyOverlap(current, p.getLeft(), p.getRight()))
).collect(Collectors.toList());
mergeVariants(current, loadAlts);
return current;
}
public static boolean isDeletion(AlternateCoordinate alt) {
return isDeletion(alt.getType(), alt.getStart(), alt.getEnd());
}
public static boolean isDeletion(VariantType type, Integer start, Integer end) {
if (type.equals(VariantType.DELETION)) {
return true;
}
if (type.equals(VariantType.INDEL) && end >= start) {
return true;
}
return false;
}
public static boolean isInsertion(AlternateCoordinate alt) {
return isInsertion(alt.getType(), alt.getStart(), alt.getEnd());
}
public static boolean isInsertion(VariantType type, Integer start, Integer end) {
if (type.equals(VariantType.INSERTION)) {
return true;
}
if (type.equals(VariantType.INDEL) && end < start) {
return true;
}
return false;
}
private Stream>> updateCollapseDeletions(
Variant current,
Stream>> stream) {
if (this.collapseDeletions) {
AlternateCoordinate currAlt = buildAltList(current).get(0);
Integer start = current.getStart();
Integer end = current.getEnd();
Consumer updateAlt = a -> {
a.setStart(start);
a.setEnd(end);
a.setReference(current.getReference());
a.setAlternate(VariantBuilder.SPAN_DELETION); // set deletion to * Alternate
a.setType(VariantType.DELETION); // set all to the same
};
if (current.getType().equals(VariantType.SNP)
|| current.getType().equals(VariantType.SNV)
|| current.getType().equals(VariantType.MNV)) {
return stream.map(pair -> {
pair.getValue().stream()
.filter(a -> {
if (a.getType().equals(VariantType.INDEL)) {
return current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true);
}
return false; // Not for other SNPs
})
.forEach(updateAlt);
return pair;
});
} else if (isDeletion(current.getType(), start, end)) {
return stream.map(pair -> {
pair.getValue().stream()
.filter(a -> !a.equals(currAlt)) // not same as current variant
.filter(a -> start >= a.getStart() && end <= a.getEnd())
.forEach(updateAlt);
return pair;
});
} else if (isInsertion(current.getType(), start, end)) {
return stream.map(pair -> {
pair.getValue().stream()
.filter(a -> !a.equals(currAlt)) // not same as current variant
.filter(a -> {
if (isInsertion(a)) {
return (start.equals(a.getStart())
&& end.equals(a.getEnd())
&& a.getAlternate().length() >= currAlt.getAlternate().length()
);
}
return !a.getType().equals(VariantType.NO_VARIATION);
}
) // only longer insertions
.forEach(updateAlt);
return pair;
});
}
}
return stream;
}
public Variant merge(Variant current, List>> vatToAlts) {
isValidVariant(current);
// Filter alt list
List>> loadAlts = vatToAlts.stream()
.filter(p -> hasAnyOverlap(current, p.getLeft(), p.getRight()))
.collect(Collectors.toList());
mergeVariants(current, loadAlts);
return current;
}
public static boolean hasAnyOverlap(Variant current, Variant other) {
if (current.overlapWith(other, true)) {
return true;
}
// SecAlt of query
return other.getStudies().stream()
.filter( s -> // foreach study
s.getSecondaryAlternates().stream()
.filter(a -> {
// Avoid NPE
a = copyAlt(other, a);
return current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true);
}
)
.findAny()
.isPresent()
)
.findAny()
.isPresent();
}
public static boolean hasAnyOverlap(Variant current, Variant other, Collection alts) {
if (current.overlapWith(other, true)) {
return true; // Important for REF region
}
return alts.stream().filter(a -> current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true))
.findAny().isPresent();
}
private String variantToString(Variant v) {
StringBuilder sb = new StringBuilder(v.getChromosome());
sb.append(":").append(v.getStart()).append("-").append(v.getEnd());
sb.append(v.getReference().isEmpty() ? "-" : v.getReference());
sb.append(":").append(v.getAlternate().isEmpty() ? "-" : v.getAlternate()).append("[");
StudyEntry se = getStudy(v);
List samples = se.getSamples();
for(String sn : se.getSamplesName()){
Integer pos = se.getSamplesPosition().get(sn);
if (pos >= samples.size()) {
sb.append(sn).append(":S;");
} else if (null == samples.get(pos) || samples.get(pos).getData().size() < 1) {
sb.append(sn).append(":G;");
} else {
String gt = samples.get(pos).getData().get(0); // GT
sb.append(sn).append(":").append(gt).append(";");
}
}
sb.append("]");
return sb.toString();
}
/**
* Merge variants into current object.
* @param current Variant object - in/out
* @param varToAlts List of Variant with matching ALTs (ALT and secondary ALTs)
*/
private void mergeVariants(Variant current, List>> varToAlts) {
StudyEntry currentStudy = getStudy(current);
// Build ALT index
List altList = buildAltsList(current, varToAlts);
// Create a list of alternates as Integers to speed up the creation of the VariantAlternateRearranger
List altListHash = alternatesToHash(altList);
// Map altIdx = index(altList);
// Check if the number of secondary alternates has increased. If so, rearrange the current study (files + samples)
VariantAlternateRearranger currentStudyRearranger = null;
if (altList.size() - 1 != currentStudy.getSecondaryAlternates().size()) {
currentStudyRearranger = new VariantAlternateRearranger(alternatesToHash(buildAltList(current)), altListHash);
}
// Update SecALt list
currentStudy.setSecondaryAlternates(altList.subList(1, altList.size()));
// Find new formats
final Map newFormatPositions;
final List newFormat;
if (expectedFormatsPosition.isEmpty()) {
// Find all formats
Set allFormats = varToAlts.stream()
.map(Pair::getKey)
.map(this::getStudy)
.map(StudyEntry::getSampleDataKeys)
.flatMap(Collection::stream)
.collect(Collectors.toSet());
newFormatPositions = new HashMap<>(currentStudy.getSampleDataKeyPositions());
// newFormatPositions.putIfAbsent(getFilterKey(), newFormatPositions.size());
for (String format : allFormats) {
newFormatPositions.putIfAbsent(format, newFormatPositions.size());
}
newFormat = Arrays.asList(new String[newFormatPositions.size()]);
newFormatPositions.forEach((format, formatIdx) -> {
newFormat.set(formatIdx, format);
});
} else {
// TODO: Try to avoid this copy
newFormatPositions = new HashMap<>(expectedFormatsPosition);
newFormat = new ArrayList<>(expectedFormatsPosition.keySet());
}
Map extraFormats = newFormatPositions.entrySet()
.stream()
.filter(e -> !e.getKey().equals(getGtKey()) && !e.getKey().equals(getFilterKey()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
// Find new samples
LinkedHashMap newSamplesPosition;
if (expectedSamplesPosition.isEmpty()) {
newSamplesPosition = new LinkedHashMap<>(currentStudy.getSamplesPosition());
varToAlts.stream()
.map(Pair::getKey)
.map(this::getStudy)
.map(StudyEntry::getOrderedSamplesName)
.flatMap(List::stream)
.forEach(sample -> newSamplesPosition.putIfAbsent(sample, newSamplesPosition.size()));
} else {
// TODO: Try to avoid this copy
newSamplesPosition = new LinkedHashMap<>(expectedSamplesPosition);
}
Integer newGtIdx = newFormatPositions.get(getGtKey());
Integer newFilterIdx = newFormatPositions.get(getFilterKey());
// Create new Samples data
List newSamplesData = newSamples(newSamplesPosition.size(), newFormatPositions);
boolean[] alreadyMergedSamples = new boolean[newSamplesPosition.size()];
// Copy current samples data into new samples data
List currentSamples = currentStudy.getSamples();
// If FT is required, but missing in the input variant, get FILTER from the File Attributes.
String currentFilterValue = null;
if (!currentStudy.getSampleDataKeySet().contains(getFilterKey()) && newFormatPositions.containsKey(getFilterKey())) {
currentFilterValue = currentStudy.getFiles().isEmpty() ? getDefaultValue(getFilterKey())
: currentStudy.getFiles().get(0).getData().getOrDefault(getAnnotationFilterKey(), getDefaultValue(getFilterKey()));
}
int currentSampleIdx = 0;
for (String sample : currentStudy.getOrderedSamplesName()) {
SampleEntry currentSample = currentSamples.get(currentSampleIdx);
Integer newSampleIdx = newSamplesPosition.get(sample);
alreadyMergedSamples[newSampleIdx] = true;
List newSampleData = newSamplesData.get(newSampleIdx).getData();
int formatIdx = 0;
int ploidy = 2; // Default ploidy. Required for the rearranger
for (String format : currentStudy.getSampleDataKeys()) {
Integer newFormatIdx = newFormatPositions.get(format);
if (newFormatIdx != null) {
if (currentSample.getData().size() > formatIdx) {
String data = currentSample.getData().get(formatIdx);
if (currentStudyRearranger != null) {
if (format.equals(VCFConstants.GENOTYPE_KEY)) {
Genotype genotype = new Genotype(data);
ploidy = genotype.getPloidy();
data = currentStudyRearranger.rearrangeGenotype(genotype).toString();
} else {
data = currentStudyRearranger.rearrange(format, data, ploidy);
}
}
newSampleData.set(newFormatIdx, data);
} else {
newSampleData.set(newFormatIdx, getDefaultValue(format));
}
}
formatIdx++;
}
// If FT is required, but missing in the input variant, add FILTER from the File Attributes.
if (newFilterIdx != null && currentFilterValue != null) {
newSampleData.set(newFilterIdx, currentFilterValue);
}
currentSampleIdx++;
}
if (currentStudyRearranger != null) {
for (FileEntry file : currentStudy.getFiles()) {
file.getData().replaceAll(currentStudyRearranger::rearrange);
}
}
for (Pair> e : varToAlts) {
Variant other = e.getKey();
List otherAlternates = e.getValue();
// Map otherAltIdx = index(alternates).entrySet().stream()
// .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
final StudyEntry otherStudy = getStudy(other);
Map otherStudyFormatPositions = otherStudy.getSampleDataKeyPositions();
checkForDuplicates(current, other, currentStudy, otherStudy, otherAlternates);
VariantAlternateRearranger rearranger;
Map rearrangedGenotypesCache = new HashMap<>(5);
// It may happen that the new list of alternates does not contains some of the other alternates.
// In that case, use the rearranger
if (altList.size() == 1 && altList.equals(otherAlternates)) {
rearranger = null;
} else {
rearranger = new VariantAlternateRearranger(
alternatesToHash(otherAlternates),
altListHash, rearrangerConf);
}
// Add GT data for each sample to current Variant
List otherOrderedSamplesName = otherStudy.getOrderedSamplesName();
for (int sampleIdx = 0; sampleIdx < otherOrderedSamplesName.size(); sampleIdx++) {
String sampleName = otherOrderedSamplesName.get(sampleIdx);
List otherSampleData = otherStudy.getSamples().get(sampleIdx).getData();
Integer newSampleIdx = newSamplesPosition.get(sampleName);
List newSampleData = newSamplesData.get(newSampleIdx).getData();
boolean alreadyMergedSample = alreadyMergedSamples[newSampleIdx];
alreadyMergedSamples[newSampleIdx] = true;
// GT data
int ploidy = -1;
boolean isGtUpdated = false;
List updatedGtPositions = Collections.emptyList();
if (newGtIdx != null) {
String gt = otherSampleData.get(otherStudyFormatPositions.get(getGtKey()));
if (StringUtils.isBlank(gt)) {
throw new IllegalStateException(String.format(
"No GT [%s] found for sample %s in \nVariant: %s\nOtherSe:%s\nOtherSp:%s",
getGtKey(), sampleName, other.getImpl(), otherStudy.getSamples(),
otherStudy.getSamplesPosition()));
}
// Use a cache with rearranged genotypes to reuse rearranged genotypes
String updatedGt = rearrangedGenotypesCache.get(gt);
if (updatedGt == null) {
Genotype genotype;
if (rearranger != null) {
genotype = rearranger.rearrangeGenotype(new Genotype(gt));
} else {
genotype = new Genotype(gt);
}
if (!genotype.isPhased()) {
genotype.normalizeAllelesIdx();
}
if (collapseDeletions) {
int[] allelesIdx = genotype.getAllelesIdx();
for (int i = 0; i < allelesIdx.length; i++) {
if (allelesIdx[i] < 0) {
allelesIdx[i] = 0; // change to '0' for 'missing' reference (missing because change to '0' GT)
}
}
}
// updatedGt = updateGT(gt, altIdx, otherAltIdx);
updatedGt = genotype.toString();
rearrangedGenotypesCache.put(gt, updatedGt);
ploidy = genotype.getPloidy();
} else {
ploidy = Genotype.getPloidy(gt);
}
if (alreadyMergedSample) {
String currGT = newSampleData.get(newGtIdx);
List gtlst;
if (currGT.contains(",")) {
gtlst = new ArrayList<>(Arrays.asList(currGT.split(",")));
} else {
gtlst = new ArrayList<>(1);
gtlst.add(currGT);
}
if (!gtlst.contains(updatedGt)) {
gtlst.add(updatedGt);
updatedGtPositions = collapseGT(gtlst);
updatedGt = StringUtils.join(updatedGtPositions.stream()
.map(gtlst::get).collect(Collectors.toList()), ',');
isGtUpdated = true;
}
}
newSampleData.set(newGtIdx, updatedGt);
}
// Filter
if (newFilterIdx != null) {
Integer otherFilterIdx = otherStudyFormatPositions.get(getFilterKey());
String filter;
if (otherFilterIdx != null) {
filter = otherFilterIdx >= otherSampleData.size()
? getDefaultValue(getFilterKey())
: otherSampleData.get(otherFilterIdx);
} else {
filter = otherStudy.getFiles().get(0).getData()
.getOrDefault(StudyEntry.FILTER, getDefaultValue(getFilterKey()));
}
if (alreadyMergedSample && isGtUpdated) {
String currFilter = newSampleData.get(newFilterIdx);
if (currFilter != null && !currFilter.equals(getDefaultValue(getFilterKey()))) {
List filterLst = new ArrayList<>(Arrays.asList(currFilter.split(",")));
filterLst.add(filter);
if (/*Objects.isNull(sampleToGt)*/updatedGtPositions.isEmpty()) {
filter = StringUtils.join(filterLst, ',');
} else {
filter = StringUtils.join(updatedGtPositions.stream()
.map(filterLst::get).collect(Collectors.toList()), ',');
}
}
}
newSampleData.set(newFilterIdx, filter);
}
// Additional data
for (Map.Entry entry : extraFormats.entrySet()) {
Integer idx = otherStudyFormatPositions.get(entry.getKey());
String data = idx == null || idx >= otherSampleData.size() ? getDefaultValue(entry.getKey()) : otherSampleData.get(idx);
if (StringUtils.isNotEmpty(data)) {
if (rearranger != null) {
data = rearranger.rearrange(entry.getKey(), data, ploidy);
}
String old = newSampleData.set(entry.getValue(), data);
// if (old != null) {
// throw new IllegalStateException("TODO - merge additional formats!!!");
// }
}
}
}
mergeFile(current, other, rearranger, currentStudy, otherStudy);
}
currentStudy.setSamples(newSamplesData);
currentStudy.setSortedSamplesPosition(newSamplesPosition);
currentStudy.setSampleDataKeys(newFormat);
}
/**
* Create a list of alternates as Integers to speed up the creation of the VariantAlternateRearranger.
*
* The creation of the VariantAlternateRearranger makes a lot of comparitions among elements from the list of alternates.
* Creating a list of hashCodes from the alternates, speeds up significantly the creation of the rearrangers.
* @param alternates List of alternates
* @return List of hashCodes for each alternate
*/
private List alternatesToHash(List alternates) {
List list = new ArrayList<>(alternates.size());
for (AlternateCoordinate a : alternates) {
// list.add(Objects.hash(a.getChromosome(), a.getStart(), a.getEnd(), a.getReference(), a.getAlternate(), a.getType()));
int result = 1;
result = 31 * result + a.getChromosome().hashCode();
result = 31 * result + a.getStart().hashCode();
result = 31 * result + a.getEnd().hashCode();
result = 31 * result + a.getReference().hashCode();
result = 31 * result + a.getAlternate().hashCode();
result = 31 * result + a.getType().hashCode();
list.add(result);
}
return list;
}
private List newSamples(int samplesSize, Map formats) {
List newSamples;
List defaultSamplesData = Arrays.asList(new String[formats.size()]);
formats.forEach((format, formatIdx) -> defaultSamplesData.set(formatIdx, getDefaultValue(format)));
newSamples = new ArrayList<>(samplesSize);
for (int i = 0; i < samplesSize; i++) {
newSamples.add(new SampleEntry(null, null, new ArrayList<>(defaultSamplesData)));
}
return newSamples;
}
private List getMatchingPositions(List list, Predicate p){
List matching = new ArrayList<>();
for (int i = 0; i < list.size(); i++) {
if (p.test(list.get(i))) {
matching.add(i);
}
}
return matching;
}
/**
* Collapses a list of GT to a minimal set.
* @param gtsStr
* @return
*/
private List collapseGT(List gtsStr) {
if (gtsStr.isEmpty()) {
return Collections.emptyList();
}
if (gtsStr.size() == 1) {
return Collections.singletonList(0);
}
List gts = gtsStr.stream().map(Genotype::new).collect(Collectors.toList());
// only get GT with an ALT e.g 0/1 0/2 1/2 etc. (ignore ./. and 0/0 GT)
Predicate findAlts = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i > 0);
Predicate findHomRef = gt -> Arrays.stream(gt.getAllelesIdx()).allMatch(i -> i == 0);
Predicate findOneRef = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i == 0);
Predicate findNoCalls = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i < 0);
List oneAltAllele = getMatchingPositions(gts, findAlts);
if (!oneAltAllele.isEmpty()) {
return oneAltAllele;
}
List reference = getMatchingPositions(gts, findHomRef);
if (!reference.isEmpty()) {
return reference;
}
List oneReferenceAllele = getMatchingPositions(gts, findOneRef);
if (!oneReferenceAllele.isEmpty()) {
return oneReferenceAllele;
}
// only no-calls left -> try to collapse
List nocalls = getMatchingPositions(gts, findNoCalls);
if (nocalls.size() == gtsStr.size()) { // all GT found
return Collections.singletonList(nocalls.get(0));
}
// don't know that could be left!!!
if (this.collapseDeletions) {
throw new IllegalStateException("Not able to resolve GT: " + StringUtils.join(gtsStr, ","));
}
return IntStream.range(0, gtsStr.size() - 1).boxed().collect(Collectors.toList());
}
private List buildAltsList(Variant current, List>> varToAlts) {
Integer start = current.getStart();
Integer end = current.getEnd();
final List currAlts = buildAltList(current);
final Set altSets = new HashSet<>(currAlts);
if (this.collapseDeletions && isDeletion(current.getType(), current.getStart(), current.getEnd())) {
// remove all alts that are NOT fully overlap current deletion -> keep only larger or same
varToAlts.stream()
.map(Pair::getValue)
.flatMap(Collection::stream)
.filter(a -> (start >= a.getStart() && end <= a.getEnd()))
.forEach(altSets::add);
} else if (this.collapseDeletions && isInsertion(current.getType(), current.getStart(), current.getEnd())) {
// remove all alts that are NOT fully overlap current deletion -> keep only larger or same
varToAlts.stream()
.map(Pair::getValue)
.flatMap(Collection::stream)
.filter(a -> {
if (isInsertion(a)) {
return (start.equals(a.getStart())
&& end.equals(a.getEnd())
&& (a.getAlternate().equals("*")
|| a.getAlternate().length() >= current.getAlternate().length())
);
}
return true;
})
.forEach(altSets::add);
} else if (this.collapseDeletions && current.getType().equals(VariantType.SNP)) {
varToAlts.forEach(p -> p.getValue().stream()
.filter(a -> current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true))
.forEach(altSets::add));
} else {
for (Pair> pair : varToAlts) {
Variant variant = pair.getKey();
List alts = pair.getValue();
if (variant.getType().equals(VariantType.NO_VARIATION) && !alts.isEmpty()) {
AlternateCoordinate mainAlternate = alts.get(0);
mainAlternate.setStart(Math.max(mainAlternate.getStart(), current.getStart()));
mainAlternate.setEnd(Math.min(mainAlternate.getEnd(), current.getEnd()));
}
altSets.addAll(alts);
}
}
// remove current alts
altSets.removeAll(currAlts);
currAlts.addAll(altSets);
for (Iterator iterator = currAlts.iterator(); iterator.hasNext();) {
AlternateCoordinate alt = iterator.next();
if (alt.getType().equals(VariantType.NO_VARIATION)) {
iterator.remove();
currAlts.add(alt);
break;
}
}
return currAlts;
}
@Deprecated
private Map index(List alts) {
Map altIdx = new HashMap<>();
AtomicInteger pos = new AtomicInteger(1); // Start at 1 -> first Alt genotype
for (AlternateCoordinate a : alts) {
altIdx.put(a, pos.getAndIncrement());
}
return altIdx;
}
private boolean checkForDuplicates(Variant current, Variant other, StudyEntry currentStudy, StudyEntry otherStudy, List otherAlts) {
Set duplicateSamples = otherStudy.getSamplesName().stream()
.filter(s -> currentStudy.getSamplesName().contains(s))
.collect(Collectors.toSet());
if (!duplicateSamples.isEmpty()) {
List currAlts = new ArrayList<>();
currAlts.add(getMainAlternate(current));
currAlts.addAll(currentStudy.getSecondaryAlternates());
for (String dupSample : duplicateSamples) {
String currGt = getStudy(current).getSampleData(dupSample, getGtKey());
Set gtIdxSet = Arrays.stream(currGt.split(","))
.flatMap(e -> Arrays.stream(e.split("/")))
.flatMap(e -> Arrays.stream(e.split("\\|")))
.filter(g -> NumberUtils.isNumber(g))
.map(g -> Integer.valueOf(g))
.filter(g -> g > 0)
.collect(Collectors.toSet());
// Find same alleles in current and other for this individual
List currConflict = gtIdxSet.stream()
.map(g -> currAlts.get(g - 1))
.filter(a -> otherAlts.stream().filter(o -> isSameVariant(a, o)).count() > 0)
.collect(Collectors.toList());
// Only Throw Exception in case of duplicated Alt for same individual
if (!currConflict.isEmpty()) {
StringBuilder sb = new StringBuilder("Duplicated entries during merging: Issue with ID ");
sb.append(dupSample).append("; Variants: ");
currConflict.forEach(a -> sb.append("\n").append(a));
sb.append(";");
throw new IllegalStateException(sb.toString());
}
}
}
return false;
}
protected void validateAlternate(AlternateCoordinate alt) {
if (alt.getChromosome() == null) {
throw new IllegalStateException("Chromosome of alt is null: " + alt);
}
if (alt.getStart() == null) {
throw new IllegalStateException("Start of alt is null: " + alt);
}
if (alt.getEnd() == null) {
throw new IllegalStateException("End of alt is null: " + alt);
}
if (alt.getReference() == null) {
throw new IllegalStateException("Reference of alt is null: " + alt);
}
if (alt.getAlternate() == null) {
throw new IllegalStateException("Alternate of alt is null: " + alt);
}
}
protected boolean equals(AlternateCoordinate alt1, AlternateCoordinate alt2) {
return alt2.getStart().equals(alt1.getStart())
&& alt2.getEnd().equals(alt1.getEnd())
&& alt2.getReference().equals(alt1.getReference())
&& alt2.getAlternate().equals(alt1.getAlternate());
}
/**
* Build a list of all the alternates from a variant. Includes the main and the secondary alternates.
* @param variant
* @return
*/
public List buildAltList(Variant variant) {
AlternateCoordinate mainAlternate = getMainAlternate(variant);
List alternates = new ArrayList<>();
boolean emptyRefBlock = mainAlternate.getType().equals(VariantType.NO_VARIATION)
&& (mainAlternate.getAlternate().isEmpty() || mainAlternate.getAlternate().equals(Allele.NO_CALL_STRING));
// Skip Reference Blocks (NO_VARIATION) where the alternate is empty
if (!emptyRefBlock) {
alternates.add(mainAlternate);
}
StudyEntry se = getStudy(variant);
if(se.getSecondaryAlternates() != null){
se.getSecondaryAlternates().forEach( alt -> alternates.add(copyAlt(variant, alt)));
}
return alternates;
}
private static AlternateCoordinate copyAlt(Variant var, AlternateCoordinate orig) {
AlternateCoordinate copy = new AlternateCoordinate();
copy.setChromosome(orig.getChromosome() == null ? var.getChromosome() : orig.getChromosome());
copy.setStart(orig.getStart() == null ? var.getStart() : orig.getStart());
copy.setEnd(orig.getEnd() == null ? var.getEnd() : orig.getEnd());
copy.setReference(orig.getReference() == null ? var.getReference() : orig.getReference());
copy.setAlternate(orig.getAlternate() == null ? var.getAlternate() : orig.getAlternate());
copy.setType(orig.getType() == null ? var.getType() : orig.getType());
return copy;
}
/**
* Get the variant as Alternate Coordinate.
*
* At this point, we don't care if the Alternate is SNP or SNV.
* In case that the variant is SV, recalculate the type, just in case the size has changed.
*
* @param variant Variant
* @return Variant as AlternateCoordinate
*/
public static AlternateCoordinate getMainAlternate(Variant variant) {
VariantType type;
switch (variant.getType()) {
case SNP:
type = VariantType.SNV;
break;
case MNP:
type = VariantType.MNV;
break;
case SV:
type = VariantBuilder.inferType(variant.getReference(), variant.getAlternate());
break;
default:
type = variant.getType();
}
return new AlternateCoordinate(variant.getChromosome(), variant.getStart(), variant.getEnd(),
variant.getReference(), variant.getAlternate(), type);
}
private void mergeFile(Variant current, Variant other, VariantAlternateRearranger rearranger,
StudyEntry currentStudy, StudyEntry otherStudy) {
List files = otherStudy.getFiles().stream()
.map(fileEntry -> FileEntry.newBuilder(fileEntry).build())
.collect(Collectors.toList());
if (!current.toString().equals(other.toString())) {
for (FileEntry file : files) {
if (file.getCall() != null) {
file.setCall(new OriginalCall(other.toString(), 0));
}
}
}
for (FileEntry file : files) {
for (Map.Entry entry : file.getData().entrySet()) {
String data = entry.getValue();
if (rearranger != null) {
data = rearranger.rearrange(entry.getKey(), data);
}
entry.setValue(data);
}
}
StudyEntry study = currentStudy;
try {
study.getFiles().addAll(files);
} catch (UnsupportedOperationException e) {
// If the files list was unmodifiable, clone the list and add.
study.setFiles(new LinkedList<>(study.getFiles()));
study.getFiles().addAll(files);
}
}
StudyEntry getStudy(Variant variant) {
if (hasStudyId()) {
StudyEntry study = variant.getStudy(getStudyId());
if (Objects.isNull(study)) {
throw new IllegalStateException("No study found for " + getStudyId());
}
return study;
}
return variant.getStudies().get(0);
}
static boolean onSameVariant (Variant a, Variant b){
return a.onSameRegion(b)
&& StringUtils.equals(a.getReference(), b.getReference())
&& StringUtils.equals(a.getAlternate(), b.getAlternate());
}
public static boolean isSameVariant(Variant a, AlternateCoordinate b){
return StringUtils.equals(a.getChromosome(), b.getChromosome())
&& a.getStart().equals(b.getStart())
&& a.getEnd().equals(b.getEnd())
&& StringUtils.equals(a.getReference(), b.getReference())
&& StringUtils.equals(a.getAlternate(), b.getAlternate());
}
public static boolean isSameVariant(AlternateCoordinate a, AlternateCoordinate b){
return StringUtils.equals(a.getChromosome(), b.getChromosome())
&& a.getStart().equals(b.getStart())
&& a.getEnd().equals(b.getEnd())
&& StringUtils.equals(a.getReference(), b.getReference())
&& StringUtils.equals(a.getAlternate(), b.getAlternate());
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy