All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.biodata.tools.variant.merge.VariantMerger Maven / Gradle / Ivy

/*
 * 
 *
 */

/**
 *
 */
package org.opencb.biodata.tools.variant.merge;

import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * @author Matthias Haimel [email protected]
 *
 */
public class VariantMerger {
    private final Logger logger = LoggerFactory.getLogger(VariantMerger.class);

    @Deprecated
    public static final String VCF_FILTER = VCFConstants.GENOTYPE_FILTER_KEY;

    public static final String GENOTYPE_FILTER_KEY = VCFConstants.GENOTYPE_FILTER_KEY;

    public static final String GT_KEY = VCFConstants.GENOTYPE_KEY;
    public static final String PASS_VALUE = "PASS";
    public static final String DEFAULT_FILTER_VALUE = ".";
    public static final String DEFAULT_MISSING_GT = Genotype.NOCALL;

    private final AtomicReference gtKey = new AtomicReference<>();
    private final AtomicReference  filterKey = new AtomicReference<>();
    private final AtomicReference  annotationFilterKey = new AtomicReference<>();

    private final boolean collapseDeletions;
    private final Set expectedSamples = new ConcurrentSkipListSet<>();
    private final Map defaultValues = new ConcurrentHashMap<>();
    private final AtomicReference studyId = new AtomicReference<>(null);

    public VariantMerger() {
        this(false);
    }

    public VariantMerger(boolean collapseDeletions) {
        this.gtKey.set(GT_KEY);
        this.filterKey.set(GENOTYPE_FILTER_KEY);
        this.annotationFilterKey.set(StudyEntry.FILTER);

        setDefaultValue(getGtKey(), DEFAULT_MISSING_GT);
        setDefaultValue(getFilterKey(), DEFAULT_FILTER_VALUE);
        this.collapseDeletions = collapseDeletions;
    }

    public void setStudyId(String studyId) {
        this.studyId.set(studyId);
    }

    private boolean hasStudyId() {
        return this.studyId.get() != null;
    }

    private String getStudyId() {
        return studyId.get();
    }

    /**
     * Adds Sample names to the sample name set.
     * Samples names are used to validate the completeness of a variant call.
     * If a sample is not seen in the merged variant, the sample will be added as the registered default value.
     * The default values are retrieved by {@link #getDefaultValue(String)} and set to {@link #DEFAULT_MISSING_GT} for GT_KEY.
     * @param sampleNames Collection of sample names.
     */
    public void addExpectedSamples(Collection sampleNames) {
        this.expectedSamples.addAll(sampleNames);
    }

    /**
     * Calls {@link Set#clear()} before {@link #addExpectedSamples(Collection)}.
     * @param sampleNames Collection of Sample names.
     */
    public void setExpectedSamples(Collection sampleNames) {
        this.expectedSamples.clear();
        this.addExpectedSamples(sampleNames);
    }

    public Set getExpectedSamples(){
        return Collections.unmodifiableSet(this.expectedSamples);
    }

    public String getGtKey() {
        return this.gtKey.get();
    }

    public void setGtKey(String gtKey) {
        updateDefaultKeys(this.gtKey.get(), gtKey);
        this.gtKey.set(gtKey);
    }


    /**
     * Update a key
     * @param from
     * @param to
     * @return true if there was a value to be moved.
     */
    public boolean updateDefaultKeys(String from, String to) {
        if (null == from) {
            return false;
        }
        if (null == to) {
            return false;
        }
        if (StringUtils.equals(from, to)) {
            return false;
        }
        String value = this.defaultValues.remove(from);
        if (null == value) {
            return false;
        }
        this.defaultValues.put(to, value);
        return true;
    }

    /**
     * Gets the default value of a key or {@link StringUtils#EMPTY} if no key is registered.
     * @param key Key
     * @return value Registered default value or {@link StringUtils#EMPTY}.
     */
    public String getDefaultValue(String key) {
        return this.defaultValues.getOrDefault(key, StringUtils.EMPTY);
    }

    public String getDefaultValue(String key, String valueIfNull) {
        return this.defaultValues.getOrDefault(key, valueIfNull);
    }

    public void setDefaultValue(String key, String value) {
        this.defaultValues.put(key, value);
    }

    public String getFilterKey() {
        return this.filterKey.get();
    }

    public void setFilterKey(String filterKey) {
        updateDefaultKeys(this.filterKey.get(), filterKey);
        this.filterKey.set(filterKey);
    }

    public String getAnnotationFilterKey() {
        return annotationFilterKey.get();
    }

    public void setAnnotationFilterKey(String annotationFilterKey) {
        updateDefaultKeys(this.annotationFilterKey.get(), annotationFilterKey);
        this.annotationFilterKey.set(annotationFilterKey);
    }

    /**
     * Create and returns a new Variant using the target as a
     * position template ONLY and merges the provided variants
     * for this position.  The target is not present in the
     * merged output!!!
     *
     * @param template Template for position and study only
     * @param load Variants to merge for position
     * @return Variant new Variant object with merged information
     */
    public Variant mergeNew(Variant template,Collection load){
        Variant current = createFromTemplate(template);
        merge(current,load);
        return current;
    }

    /**
     * Create an empty Variant (position, ref, alt) from a template with basic Study information without samples.
     * @param target Variant to take as a template
     * @return Variant filled with chromosome, start, end, ref, alt, study ID and format set to GT only, BUT no samples.
     */
    public Variant createFromTemplate(Variant target) {
        Variant var = new Variant(target.getChromosome(), target.getStart(), target.getEnd(), target.getReference(), target.getAlternate());
        var.setType(target.getType());
        for(StudyEntry tse : target.getStudies()){
            StudyEntry se = new StudyEntry(tse.getStudyId());
            se.setFiles(Collections.singletonList(new FileEntry("", "", new HashMap<>())));
            se.setFormat(Arrays.asList(getGtKey(), getFilterKey()));
            se.setSamplesPosition(new HashMap<>());
            se.setSamplesData(new ArrayList<>());
            var.addStudyEntry(se);
        }
        return var;
    }

    private void isValidVariant(Variant current) throws IllegalArgumentException{
        if (current.getType().equals(VariantType.NO_VARIATION)) {
            throw new IllegalStateException("Current variant can't be a NO_VARIANT");
        }

        // Validate variant information
//        ensureGtFormat(current);
        if (getStudy(current).getFormat() == null || getStudy(current).getFormat().isEmpty()) {
            throw new IllegalArgumentException("Format of sample data is empty!!!!!!");
        }
    }

    /**
     * Calls {@link #merge(Variant, Collection)}
     * @param current {@link Variant} to update.
     * @param load {@link Variant} to be merged.
     * @return Merged Variant object.
     */
    Variant merge(Variant current, Variant load){
        return merge(current, Collections.singleton(load));
    }

    /**
     * Merge a collection of variants into one variant.
     * @param current {@link Variant} to update with collection of variants. This object will be modified.
     * @param load {@link Variant} to be merged.
     * @return Modified {@link Variant} object (passed in as current.
     */
    public Variant merge(Variant current, Collection load){
        isValidVariant(current);
        // Build alt list
        List>> loadAlts =
                updateCollapseDeletions(current,
                        load.stream()
                                .map(v -> new MutablePair<>(v, buildAltList(v)))
                                .filter(p -> hasAnyOverlap(current, p.getLeft(), p.getRight()))
                ).collect(Collectors.toList());

        mergeVariants(current, loadAlts);
        fillMissingGt(current);
        return current;
    }

    public static boolean isDeletion(AlternateCoordinate alt) {
        return isDeletion(alt.getType(), alt.getStart(), alt.getEnd());
    }

    public static boolean isDeletion(VariantType type, Integer start, Integer end) {
        if (type.equals(VariantType.DELETION)) {
            return true;
        }
        if (type.equals(VariantType.INDEL) && end >= start) {
            return true;
        }
        return false;
    }
    public static boolean isInsertion(AlternateCoordinate alt) {
        return isInsertion(alt.getType(), alt.getStart(), alt.getEnd());
    }

    public static boolean isInsertion(VariantType type, Integer start, Integer end) {
        if (type.equals(VariantType.INSERTION)) {
            return true;
        }
        if (type.equals(VariantType.INDEL) && end < start) {
            return true;
        }
        return false;
    }

    private Stream>> updateCollapseDeletions(
            Variant current,
            Stream>> stream) {
        if (this.collapseDeletions) {
            AlternateCoordinate currAlt = buildAltList(current).get(0);
            Integer start = current.getStart();
            Integer end = current.getEnd();
            Consumer updateAlt = a -> {
                a.setStart(start);
                a.setEnd(end);
                a.setReference(current.getReference());
                a.setAlternate("*"); // set deletion to * Alternate
                a.setType(VariantType.MIXED); // set all to the same
            };
            if (current.getType().equals(VariantType.SNP)
                    || current.getType().equals(VariantType.SNV)
                    || current.getType().equals(VariantType.MNV)) {
                return stream.map(pair -> {
                    pair.getValue().stream()
                            .filter(a -> {
                                if (a.getType().equals(VariantType.INDEL)) {
                                    return current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true);
                                }
                                return false; // Not for other SNPs
                            })
                            .forEach(updateAlt);
                    return pair;
                });
            } else if (isDeletion(current.getType(), start, end)) {
                return stream.map(pair -> {
                    pair.getValue().stream()
                            .filter(a -> !a.equals(currAlt)) // not same as current variant
                            .filter(a -> start >= a.getStart() && end <= a.getEnd())
                            .forEach(updateAlt);
                    return pair;
                });
            } else if (isInsertion(current.getType(), start, end)) {
                return stream.map(pair -> {
                    pair.getValue().stream()
                            .filter(a -> !a.equals(currAlt)) // not same as current variant
                            .filter(a -> {
                                        if (isInsertion(a)) {
                                            return (start.equals(a.getStart())
                                                    && end.equals(a.getEnd())
                                                    && a.getAlternate().length() >= currAlt.getAlternate().length()
                                            );
                                        }
                                        return !a.getType().equals(VariantType.NO_VARIATION);
                                    }
                            ) // only longer insertions
                            .forEach(updateAlt);
                    return pair;
                });
            }
        }
        return stream;
    }

    private void fillMissingGt(Variant variant){
        if (this.getExpectedSamples().isEmpty()) {
            return; // Nothing to do.
        }
        Set missing = new HashSet<>(getExpectedSamples());
        StudyEntry study = getStudy(variant);
        missing.removeAll(study.getSamplesName());
        if (missing.isEmpty()) {
            return; // Nothing to do.
        }
        // Prepare one data template for all missing samples.
        Map formatPositions = study.getFormatPositions();
        String[] dataList = new String[formatPositions.size()];
        Arrays.fill(dataList, StringUtils.EMPTY);
        dataList[formatPositions.get(getGtKey())] = getDefaultValue(getGtKey()); //set Missing GT
        if (formatPositions.containsKey(getFilterKey())) {
            dataList[formatPositions.get(getFilterKey())] = getDefaultValue(getFilterKey());
        }
        // Register template for all missing samples.
        missing.forEach(sample -> study.addSampleData(sample, new ArrayList(Arrays.asList(dataList))));
    }

    public Variant merge(Variant current, List>> vatToAlts) {
        isValidVariant(current);
        // Filter alt list
        List>> loadAlts = vatToAlts.stream()
                .filter(p -> hasAnyOverlap(current, p.getLeft(), p.getRight()))
                .collect(Collectors.toList());
        mergeVariants(current, loadAlts);
        return current;
    }

    public static boolean hasAnyOverlap(Variant current, Variant other) {
        if (current.overlapWith(other, true)) {
            return true;
        }
        // SecAlt of query
        return other.getStudies().stream()
                .filter( s -> // foreach study
                        s.getSecondaryAlternates().stream()
                                .filter(a -> {
                                            // Avoid NPE
                                            a = copyAlt(other, a);
                                            return current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true);
                                        }
                                )
                                .findAny()
                                .isPresent()
                )
                .findAny()
                .isPresent();
    }

    public static boolean hasAnyOverlap(Variant current, Variant other, Collection alts) {
        if (current.overlapWith(other, true)) {
            return true; // Important for REF region
        }
        return alts.stream().filter(a -> current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true))
                .findAny().isPresent();
    }

    private String variantToString(Variant v) {
        StringBuilder sb = new StringBuilder(v.getChromosome());
        sb.append(":").append(v.getStart()).append("-").append(v.getEnd());
        sb.append(v.getReference().isEmpty() ? "-" : v.getReference());
        sb.append(":").append(v.getAlternate().isEmpty() ? "-" : v.getAlternate()).append("[");
        StudyEntry se = v.getStudies().get(0);
        List> sd = se.getSamplesData();
        for(String sn : se.getSamplesName()){
            Integer pos = se.getSamplesPosition().get(sn);
            if (pos >= sd.size()) {
                sb.append(sn).append(":S;");
            } else if (null == sd.get(pos) || sd.get(pos).size() < 1) {
                sb.append(sn).append(":G;");
            } else {
                String gt = sd.get(pos).get(0); // GT
                sb.append(sn).append(":").append(gt).append(";");
            }
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Merge variants into current object.
     * @param current Variant object - in/out
     * @param varToAlts List of Variant with matching ALTs (ALT and secondary ALTs)
     */
    void mergeVariants(Variant current, List>> varToAlts) {
        StudyEntry currentStudy = getStudy(current);
        String defaultFilterValue = currentStudy.getFiles().isEmpty() ? getDefaultValue(getFilterKey())
                : currentStudy.getFiles().get(0).getAttributes().getOrDefault(getAnnotationFilterKey(), getDefaultValue(getFilterKey()));
        ensureFormat(currentStudy, getFilterKey(), defaultFilterValue);

        final List orderedSamplesName = new ArrayList<>(currentStudy.getOrderedSamplesName());
        final Set currSampleNames = new HashSet<>(currentStudy.getSamplesName());

        // Build ALT index
        List altList = buildAltsList(current, varToAlts.stream().map(Pair::getRight).collect(Collectors.toList()));
        Map altIdx = index(altList);

        // Update SecALt list
        currentStudy.setSecondaryAlternates(altList.subList(1,altList.size()));
        final Map formatPositions = new HashMap<>(currentStudy.getFormatPositions());
        final Map additionalFormats = new HashMap<>(formatPositions);
        additionalFormats.remove(getGtKey());
        additionalFormats.remove(getFilterKey());

        final Map sampleToGt;
        if (formatPositions.keySet().contains(getGtKey())) {
            if (!(formatPositions.get(getGtKey()).equals(0))) {
                throw new IllegalStateException("Current study expected to be in order of 'GT'");
            }
            sampleToGt = sampleToGt(current);
        } else {
            sampleToGt  = null;
        }
        final Map sampleToFilter = sampleToSampleData(current, getFilterKey());
        final Map> sampleToAdditional = sampleToAdditionalData(additionalFormats, current);

        varToAlts.forEach(e -> {
            Variant other = e.getKey();
            Map otherAltIdx = index(e.getValue()).entrySet().stream()
                    .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
            final StudyEntry otherStudy = getStudy(other);
            final Map otherSampleToGt = sampleToGt(other);
            final Map otherSampleToFilter = otherStudy.getFormat().contains(getFilterKey())
                    ? sampleToSampleData(other, getFilterKey())
                    : sampleToAttribute(other, getAnnotationFilterKey());

            final Map> otherSampleToAdditionalFormats = sampleToAdditionalData(additionalFormats, other);

            checkForDuplicates(current, other, currentStudy, otherStudy, e.getValue());

            // Add GT data for each sample to current Variant
            otherStudy.getOrderedSamplesName().forEach(sampleName -> {
                boolean alreadyInStudy = currSampleNames.contains(sampleName);
                if (!alreadyInStudy) {
                    orderedSamplesName.add(sampleName);
                    currSampleNames.add(sampleName);
                }

                // GT data
                boolean isGtUpdated = false;
                List updatedGtPositions = Collections.emptyList();
                if (sampleToGt != null) {
                    String gt = otherSampleToGt.get(sampleName);
                    if (StringUtils.isBlank(gt)) {
                        throw new IllegalStateException(String.format(
                                "No GT [%s] found for sample %s in \nVariant: %s\nIndexOther:%s\nIndex:%s\nOtherSe:%s\nOtherSp:%s",
                                getGtKey(), sampleName, other.getImpl(), otherSampleToGt, sampleToGt, otherStudy.getSamplesData(),
                                otherStudy.getSamplesPosition()));
                    }
                    String updatedGt = updateGT(gt, altIdx, otherAltIdx);
                    if (alreadyInStudy) {
                        String currGT = sampleToGt.get(sampleName);
                        List gtlst = new ArrayList<>(Arrays.asList(currGT.split(",")));
                        if (!gtlst.contains(updatedGt)) {
                            gtlst.add(updatedGt);
                            updatedGtPositions = collapseGT(gtlst);
                            updatedGt = StringUtils.join(updatedGtPositions.stream()
                                    .map(p -> gtlst.get(p)).collect(Collectors.toList()), ',');
                            isGtUpdated = true;
                        }
                        sampleToGt.put(sampleName, updatedGt);
                    }
                    sampleToGt.put(sampleName, updatedGt);
                }

                // Filter
                String filter = otherSampleToFilter.getOrDefault(sampleName, getDefaultValue(getFilterKey()));
                if (alreadyInStudy && isGtUpdated) {
                    String currFilter = sampleToFilter.get(sampleName);
                    List filterLst = new ArrayList<>(Arrays.asList(currFilter.split(",")));
                    filterLst.add(filter);
                    if (Objects.isNull(sampleToGt)) {
                        filter = StringUtils.join(filterLst, ',');
                    } else {
                        filter = StringUtils.join(updatedGtPositions.stream()
                                .map(p -> filterLst.get(p)).collect(Collectors.toList()), ',');
                    }
                }
                sampleToFilter.put(sampleName, filter);

                // Additional data
                Map otherAdditional = otherSampleToAdditionalFormats.get(sampleName);
                if (otherAdditional != null) {
                    if (!sampleToAdditional.containsKey(sampleName)) {
                        sampleToAdditional.put(sampleName, otherAdditional);
                    } else {
                        throw new IllegalStateException("TODO - merge additional formats!!!");
                    }
                }
            });
            mergeFiles(current, other, currentStudy, otherStudy);
        });
        updateStudy(currentStudy, sampleToGt, sampleToFilter, sampleToAdditional, orderedSamplesName);
    }

    /**
     * Update study with Annotations.
     * @param study
     * @param sampleToGt
     * @param sampleToFilter
     * @param sampleToAdditional
     * @param orderedSamplesName
     */
    private void updateStudy(StudyEntry study, Map sampleToGt, Map sampleToFilter,
                             Map> sampleToAdditional, List orderedSamplesName) {

        // Build empty sample data.
        List format = study.getFormat();
        String[] formatTemplate = new String[format.size()];
        Arrays.fill(formatTemplate, StringUtils.EMPTY);

        // Build samples position
        LinkedHashMap samplesPosition = new LinkedHashMap<>();
        int sampleSize = orderedSamplesName.size();
        for (int i = 0; i < sampleSize; i++) {
            samplesPosition.put(orderedSamplesName.get(i), i);
        }

        // Init samples data with empty string.
        List> samplesData = new ArrayList<>(sampleSize);
        for (int i = 0; i < sampleSize; ++i) {
            samplesData.add(new ArrayList<>(Arrays.asList(formatTemplate)));
        }

        // Add genotypes, filer
        for (int pos = 0; pos < format.size(); pos++) {
            String currFormat = format.get(pos);
            if (StringUtils.equals(currFormat, getGtKey())) {
                for (Map.Entry entry : sampleToGt.entrySet()) {
                    samplesData.get(samplesPosition.get(entry.getKey())).set(pos, entry.getValue());
                }
            } else if (StringUtils.equals(currFormat, getFilterKey())) {
                for (Map.Entry entry : sampleToFilter.entrySet()) {
                    samplesData.get(samplesPosition.get(entry.getKey())).set(pos, entry.getValue());
                }
            }
        }
        // and additional values
        sampleToAdditional.forEach((sample, m) -> m.forEach((pos, val) -> samplesData.get(samplesPosition.get(sample)).set(pos, val)));

//        for (int i = 0; i < samplesData.size(); i++) {
//            if (null == samplesData.get(i)) {
//                throw new IllegalStateException("Position " + i + " of " + samplesData.size() + " not filled!!!");
//            }
//        }
        study.setSamplesPosition(samplesPosition);
        study.setSamplesData(samplesData);
    }

    private  List getMatchingPositions(List list, Predicate p){
        List matching = new ArrayList<>();
        for (int i = 0; i < list.size(); i++) {
            if (p.test(list.get(i))) {
                matching.add(i);
            }
        }
        return matching;
    }

    /**
     * Collapses a list of GT to a minimal set.
     * @param gtsStr
     * @return
     */
    private List collapseGT(List gtsStr) {
        if (gtsStr.isEmpty()) {
            return Collections.emptyList();
        }
        if (gtsStr.size() == 1) {
            return Collections.singletonList(0);
        }

        List gts = gtsStr.stream().map(Genotype::new).collect(Collectors.toList());

        // only get GT with an ALT e.g 0/1 0/2 1/2 etc. (ignore ./. and 0/0 GT)
        Predicate findAlts = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i > 0);
        Predicate findHomRef = gt -> Arrays.stream(gt.getAllelesIdx()).allMatch(i -> i == 0);
        Predicate findOneRef = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i == 0);
        Predicate findNoCalls = gt -> Arrays.stream(gt.getAllelesIdx()).anyMatch(i -> i < 0);

        List oneAltAllele = getMatchingPositions(gts, findAlts);
        if (!oneAltAllele.isEmpty()) {
            return oneAltAllele;
        }
        List reference = getMatchingPositions(gts, findHomRef);
        if (!reference.isEmpty()) {
            return reference;
        }

        List oneReferenceAllele = getMatchingPositions(gts, findOneRef);
        if (!oneReferenceAllele.isEmpty()) {
            return oneReferenceAllele;
        }
        // only no-calls left -> try to collapse
        List nocalls = getMatchingPositions(gts, findNoCalls);
        if (nocalls.size() == gtsStr.size()) { // all GT found
            return Collections.singletonList(nocalls.get(0));
        }
        // don't know that could be left!!!
        if (this.collapseDeletions) {
            throw new IllegalStateException("Not able to resolve GT: " + StringUtils.join(gtsStr, ","));
        }
        return IntStream.range(0, gtsStr.size() - 1).boxed().collect(Collectors.toList());
    }

    private String updateGT(String gt, Map curr, Map other) {
        Genotype gto = new Genotype(gt);
        int[] idx = gto.getAllelesIdx();
        int len = idx.length;
        IntStream.range(0, len).boxed().filter(i -> idx[i] > 0 && idx[i] <= other.size())
                .forEach(i -> {
                    Integer allele = curr.get(other.get(idx[i]));
                    if (this.collapseDeletions && Objects.isNull(allele)) {
                        allele = 0; // change to '0' for 'missing' reference (missing because change to '0' GT)
                    }
                    gto.updateAlleleIdx(i, allele);
                });
        if (!gto.isPhased()) {
            Arrays.sort(idx);
        }
        return gto.toGenotypeString();
    }

    private List buildAltsList (Variant current, Collection> alts) {
        Integer start = current.getStart();
        Integer end = current.getEnd();
        final List currAlts = buildAltList(current);
        final Set altSets = new HashSet<>(currAlts);
        if (this.collapseDeletions && isDeletion(current.getType(), current.getStart(), current.getEnd())) {
            // remove all alts that are NOT fully overlap current deletion -> keep only larger or same
            alts.stream()
                    .flatMap(Collection::stream)
                    .filter(a -> (start >= a.getStart() && end <= a.getEnd()))
                    .forEach(altSets::add);
        } else if (this.collapseDeletions && isInsertion(current.getType(), current.getStart(), current.getEnd())) {
            // remove all alts that are NOT fully overlap current deletion -> keep only larger or same
            alts.stream()
                    .flatMap(Collection::stream)
                    .filter(a -> {
                        if (isInsertion(a)) {
                            return (start.equals(a.getStart())
                                    && end.equals(a.getEnd())
                                    && (a.getAlternate().equals("*")
                                    || a.getAlternate().length() >= current.getAlternate().length())
                            );
                        }
                        return true;
                    })
                    .forEach(altSets::add);
        } else if (this.collapseDeletions && current.getType().equals(VariantType.SNP)) {
            alts.forEach(l -> l.stream()
                    .filter(a ->current.overlapWith(a.getChromosome(), a.getStart(), a.getEnd(), true))
                    .forEach(altSets::add));
        } else {
            alts.forEach(altSets::addAll);
        }
        // remove current alts
        altSets.removeAll(currAlts);
        currAlts.addAll(altSets);
        return currAlts;
    }

    private Map index(List alts) {
        Map altIdx = new HashMap<>();
        AtomicInteger pos = new AtomicInteger(1); // Start at 1 -> first Alt genotype
        for (AlternateCoordinate a : alts) {
            altIdx.put(a, pos.getAndIncrement());
        }
        return altIdx;
    }

    private boolean checkForDuplicates(Variant current, Variant other, StudyEntry currentStudy, StudyEntry otherStudy, List otherAlts) {
        Set duplicateSamples = otherStudy.getSamplesName().stream()
                .filter(s -> currentStudy.getSamplesName().contains(s))
                .collect(Collectors.toSet());
        if (!duplicateSamples.isEmpty()) {
            Map currSampleToGts = sampleToGt(current);
            List currAlts = new ArrayList<>();
            currAlts.add(getMainAlternate(current));
            currAlts.addAll(currentStudy.getSecondaryAlternates());
            for (String dupSample : duplicateSamples) {
                String currGt = currSampleToGts.get(dupSample);
                Set gtIdxSet = Arrays.stream(currGt.split(","))
                        .flatMap(e -> Arrays.stream(e.split("/")))
                        .flatMap(e -> Arrays.stream(e.split("\\|")))
                        .filter(g -> NumberUtils.isNumber(g))
                        .map(g -> Integer.valueOf(g))
                        .filter(g -> g > 0)
                        .collect(Collectors.toSet());
                // Find same alleles in current and other for this individual
                List currConflict = gtIdxSet.stream()
                        .map(g -> currAlts.get(g - 1))
                        .filter(a -> otherAlts.stream().filter(o -> isSameVariant(a, o)).count() > 0)
                        .collect(Collectors.toList());
                // Only Throw Exception in case of duplicated Alt for same individual
                if (!currConflict.isEmpty()) {
                    StringBuilder sb = new StringBuilder("Duplicated entries during merging: Issue with ID ");
                    sb.append(dupSample).append("; Variants: ");
                    currConflict.forEach(a -> sb.append("\n").append(a));
                    sb.append(";");
                    throw new IllegalStateException(sb.toString());
                }
            }
        }
        return false;
    }


    protected void validateAlternate(AlternateCoordinate alt) {
        if (alt.getChromosome() == null) {
            throw new IllegalStateException("Chromosome of alt is null: " + alt);
        }
        if (alt.getStart() == null) {
            throw new IllegalStateException("Start of alt is null: " + alt);
        }
        if (alt.getEnd() == null) {
            throw new IllegalStateException("End of alt is null: " + alt);
        }
        if (alt.getReference() == null) {
            throw new IllegalStateException("Reference of alt is null: " + alt);
        }
        if (alt.getAlternate() == null) {
            throw new IllegalStateException("Alternate of alt is null: " + alt);
        }
    }

    protected boolean equals(AlternateCoordinate alt1, AlternateCoordinate alt2) {
        return getStart(alt2).equals(getStart(alt1)) && getEnd(alt2).equals(getEnd(alt1))
                && getReference(alt2).equals(getReference(alt1))
                && getAlternate(alt2).equals(getAlternate(alt1));
    }

    private String getReference(AlternateCoordinate s) {
        return s.getReference();
    }

    private String getAlternate(AlternateCoordinate s) {
        return s.getAlternate();
    }

    private Integer getStart(AlternateCoordinate s) {
        return s.getStart();
    }

    private Integer getEnd(AlternateCoordinate s) {
        return s.getEnd();
    }


    /**
     * Build a list of all the alternates from a variant. Includes the main and the secondary alternates.
     * @param variant
     * @return
     */
    public List buildAltList(Variant variant) {
        AlternateCoordinate mainAlternate = getMainAlternate(variant);
        List alternates = new ArrayList<>();
        if (!mainAlternate.getType().equals(VariantType.NO_VARIATION)) {
            alternates.add(mainAlternate);
        }
        StudyEntry se = getStudy(variant);
        if(se.getSecondaryAlternates() != null){
            se.getSecondaryAlternates().forEach( alt -> alternates.add(copyAlt(variant, alt)));
        }
        return alternates;
    }

    private static AlternateCoordinate copyAlt(Variant var, AlternateCoordinate orig) {
        AlternateCoordinate copy = new AlternateCoordinate();
        copy.setChromosome(orig.getChromosome() == null ? var.getChromosome() : orig.getChromosome());
        copy.setStart(orig.getStart() == null ? var.getStart() : orig.getStart());
        copy.setEnd(orig.getEnd() == null ? var.getEnd() : orig.getEnd());
        copy.setReference(orig.getReference() == null ? var.getReference() : orig.getReference());
        copy.setAlternate(orig.getAlternate() == null ? var.getAlternate() : orig.getAlternate());
        copy.setType(orig.getType() == null ? var.getType() : orig.getType());
        return copy;
    }

    /**
     * Get the variant as Alternate Coordinate.
     *
     * At this point, we don't care if the Alternate is SNP or SNV.
     * In case that the variant is SV, recalculate the type, just in case the size has changed.
     *
     * @param variant Variant
     * @return Variant as AlternateCoordinate
     */
    public static AlternateCoordinate getMainAlternate(Variant variant) {
        VariantType type;
        switch (variant.getType()) {
            case SNP:
                type = VariantType.SNV;
                break;
            case MNP:
                type = VariantType.MNV;
                break;
            case SV:
                type = Variant.inferType(variant.getReference(), variant.getAlternate(), variant.getLength());
                break;
            default:
                type = variant.getType();

        }
        return new AlternateCoordinate(variant.getChromosome(), variant.getStart(), variant.getEnd(),
                variant.getReference(), variant.getAlternate(), type);
    }

    private void mergeFiles(Variant current, Variant other, StudyEntry currentStudy, StudyEntry otherStudy) {
        String call = other.getStart() + ":" + other.getReference() + ":" + other.getAlternate() + ":0";

        List files = otherStudy.getFiles().stream().map(fileEntry -> FileEntry.newBuilder(fileEntry).build()).collect(Collectors.toList());
        if (!current.toString().equals(other.toString())) {
            for (FileEntry file : files) {
                if (file.getCall() == null || file.getCall().isEmpty()) {
                    file.setCall(call);
                }
            }
        }
        StudyEntry study = currentStudy;
        try {
            study.getFiles().addAll(files);
        } catch (UnsupportedOperationException e) {
            // If the files list was unmodifiable, clone the list and add.
            study.setFiles(new LinkedList<>(study.getFiles()));
            study.getFiles().addAll(files);
        }
    }

    private Map sampleToAttribute(Variant var, String key) {
        StudyEntry se = getStudy(var);
        String value = se.getFiles().get(0).getAttributes().getOrDefault(key, "");
        if (StringUtils.isBlank(value)) {
            return Collections.emptyMap();
        }
        return se.getSamplesName().stream().collect(Collectors.toMap(e -> e, e -> value));
    }

    private Map sampleToGt(Variant load) {
        return sampleToSampleData(load, getGtKey());
    }

    private Map> sampleToAdditionalData(Map additionalFormat, Variant var) {
        Map> sampleToAdditional = new HashMap<>();
        additionalFormat.forEach((key, idx) -> {
            Map sampleToValue = sampleToSampleData(var, key);
            sampleToValue.forEach((sample, value) -> {
                Map map = sampleToAdditional.get(sample);
                if (map == null) {
                    map = new HashMap<>();
                    sampleToAdditional.put(sample, map);
                }
                map.put(idx, value);
            });
        });
        return sampleToAdditional;
    }

    private Map sampleToSampleData(Variant var, String key){
        Map retMap = new HashMap<>();
        StudyEntry se = getStudy(var);
        LinkedHashMap samplesPosition = se.getSamplesPosition();
        List> samplesData = se.getSamplesData();
        if (Objects.isNull(samplesData)) {
            throw new IllegalStateException("No samplesData retrieved: " + var.getImpl());
        }
        Map formatPositions = se.getFormatPositions();
        Integer formatPosition = formatPositions.get(key);
        String defaultValue = getDefaultValue(key);

        samplesPosition.forEach((s,samplePosition) -> {
            List values = samplesData.get(samplePosition);
            if (Objects.isNull(values)) {
                throw new IllegalStateException("No values for sample " + s + " at position " + samplePosition + " in " + samplesData);
            }
            if (Objects.isNull(formatPosition)) {
                retMap.put(s, defaultValue);
            } else {
                String val = values.get(formatPosition);
                if (StringUtils.isNotBlank(val)) {
                    retMap.put(s, val);
                }
            }
        });
        return retMap;
    }

    StudyEntry getStudy(Variant variant) {
        if (hasStudyId()) {
            StudyEntry study = variant.getStudy(getStudyId());
            if (Objects.isNull(study)) {
                throw new IllegalStateException("No study found for " + getStudyId());
            }
            return study;
        }
        return variant.getStudies().get(0);
    }

    /**
     * Ensures that all the samples contains the required format value.
     * @param studyEntry
     * @param formatValue
     * @param defaultValue
     */
    private void ensureFormat(StudyEntry studyEntry, String formatValue, String defaultValue) {
        if (!studyEntry.getFormat().contains(formatValue)) {
            studyEntry.addFormat(formatValue);
            if (studyEntry.getSamplesData() != null && !studyEntry.getSamplesData().isEmpty()) {
                for (String sampleName : studyEntry.getOrderedSamplesName()) {
                    studyEntry.addSampleData(sampleName, getFilterKey(), defaultValue);
                }
            }
        }
    }

    static boolean onSameVariant (Variant a, Variant b){
        return a.onSameRegion(b)
                && StringUtils.equals(a.getReference(), b.getReference())
                && StringUtils.equals(a.getAlternate(), b.getAlternate());
    }

    public static boolean isSameVariant(Variant a, AlternateCoordinate b){
        return StringUtils.equals(a.getChromosome(), b.getChromosome())
                && a.getStart().equals(b.getStart())
                && a.getEnd().equals(b.getEnd())
                && StringUtils.equals(a.getReference(), b.getReference())
                && StringUtils.equals(a.getAlternate(), b.getAlternate());
    }

    public static boolean isSameVariant(AlternateCoordinate a, AlternateCoordinate b){
        return StringUtils.equals(a.getChromosome(), b.getChromosome())
                && a.getStart().equals(b.getStart())
                && a.getEnd().equals(b.getEnd())
                && StringUtils.equals(a.getReference(), b.getReference())
                && StringUtils.equals(a.getAlternate(), b.getAlternate());
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy