All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.biodata.tools.variant.VariantNormalizer Maven / Gradle / Ivy

/*
 * 
 *
 */

package org.opencb.biodata.tools.variant;

import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.biojava.nbio.alignment.Alignments;
import org.biojava.nbio.alignment.SimpleGapPenalty;
import org.biojava.nbio.alignment.SubstitutionMatrixHelper;
import org.biojava.nbio.alignment.template.SequencePair;
import org.biojava.nbio.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField;
import org.opencb.biodata.tools.variant.exceptions.VariantNormalizerException;
import org.opencb.biodata.tools.variant.merge.VariantAlternateRearranger;
import org.opencb.commons.run.ParallelTaskRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
 * Created on 06/10/15
 *
 * @author Jacobo Coll <[email protected]>
 */
public class VariantNormalizer implements ParallelTaskRunner.Task {

    protected Logger logger = LoggerFactory.getLogger(this.getClass().toString());

    private boolean reuseVariants = true;
    private boolean normalizeAlleles = false;
    private boolean decomposeMNVs = false;
    private boolean generateReferenceBlocks = false;
    private Map genotypeReorderMapCache = new ConcurrentHashMap<>();
    private final VariantAlternateRearranger.Configuration rearrangerConf = new VariantAlternateRearranger.Configuration();

    private static final Set VALID_NTS = new HashSet<>(Arrays.asList("A", "C", "G", "T", "N"));
    private static final String CNV = "";
    private static final String COPY_NUMBER_TAG = "CN";
    private static final String CIPOS_STRING = "CIPOS";
    private static final String CIEND_STRING = "CIEND";

    private static final String[] ALLELE_TO_STRING = new String[]{"0", "1", "2", "3", "4", "5"};

    public VariantNormalizer() {}

    public VariantNormalizer(boolean reuseVariants) {
        this.reuseVariants = reuseVariants;
    }

    public VariantNormalizer(boolean reuseVariants, boolean normalizeAlleles) {
        this.reuseVariants = reuseVariants;
        this.normalizeAlleles = normalizeAlleles;
    }

    public VariantNormalizer(boolean reuseVariants, boolean normalizeAlleles, boolean decomposeMNVs) {
        this.reuseVariants = reuseVariants;
        this.normalizeAlleles = normalizeAlleles;
        this.decomposeMNVs = decomposeMNVs;
    }

    public boolean isReuseVariants() {
        return reuseVariants;
    }

    public VariantNormalizer setReuseVariants(boolean reuseVariants) {
        this.reuseVariants = reuseVariants;
        return this;
    }

    public boolean isNormalizeAlleles() {
        return normalizeAlleles;
    }

    public boolean isDecomposeMNVs() {
        return decomposeMNVs;
    }

    public VariantNormalizer setDecomposeMNVs(boolean decomposeMNVs) {
        this.decomposeMNVs = decomposeMNVs;
        return this;
    }

    public VariantNormalizer setNormalizeAlleles(boolean normalizeAlleles) {
        this.normalizeAlleles = normalizeAlleles;
        return this;
    }

    public boolean isGenerateReferenceBlocks() {
        return generateReferenceBlocks;
    }

    public VariantNormalizer setGenerateReferenceBlocks(boolean generateReferenceBlocks) {
        this.generateReferenceBlocks = generateReferenceBlocks;
        return this;
    }

    public VariantNormalizer configure(VCFHeader header) {
        rearrangerConf.configure(header);
        return this;
    }

    public VariantNormalizer configure(Collection lines) {
        rearrangerConf.configure(lines);
        return this;
    }

    public VariantNormalizer configure(VCFCompoundHeaderLine line) {
        rearrangerConf.configure(line);
        return this;
    }

    public VariantNormalizer configure(String key, VCFHeaderLineCount number, VCFHeaderLineType type) {
        rearrangerConf.configure(key, number, type);
        return this;
    }

    @Override
    public List apply(List batch) {
        try {
            return normalize(batch, reuseVariants);
        } catch (NonStandardCompliantSampleField e) {
            throw new RuntimeException(e);
        }
    }

    public List normalize(List batch, boolean reuse) throws NonStandardCompliantSampleField {
        List normalizedVariants = new ArrayList<>(batch.size());

        for (Variant variant : batch) {
            if (!isNormalizable(variant)) {
                normalizedVariants.add(variant);
                continue;
            }
            String reference = variant.getReference();  //Save original values, as they can be changed
            String alternate = variant.getAlternate();
            Integer start = variant.getStart();
            Integer end = variant.getEnd();
            String chromosome = variant.getChromosome();

            if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
                List keyFieldsList;
                if (variant.isSymbolic()) {
                    keyFieldsList = normalizeSymbolic(start, end, reference, alternate);
                } else {
                    keyFieldsList = normalize(chromosome, start, reference, alternate);
                }
                // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
                for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
                    String call = start + ":" + reference + ":" + alternate + ":" + keyFields.getNumAllele();
                    Variant normalizedVariant = newVariant(variant, keyFields);
                    if (keyFields.getPhaseSet() != null) {
                        StudyEntry studyEntry = new StudyEntry();
                        studyEntry.setSamplesData(
                                Collections.singletonList(Collections.singletonList(keyFields.getPhaseSet())));
                        studyEntry.setFormat(Collections.singletonList("PS"));
                        // Use mnv string as file Id so that it can be later identified. It is also used
                        // as the genotype call since we don't have an actual call and to avoid confusion
                        studyEntry.setFiles(Collections.singletonList(new FileEntry(keyFields.getPhaseSet(), call, null)));
                        normalizedVariant.setStudies(Collections.singletonList(studyEntry));
                    }
                    normalizedVariants.add(normalizedVariant);
                }
            } else {
                for (StudyEntry entry : variant.getStudies()) {
                    List alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
                    alternates.add(alternate);
                    alternates.addAll(entry.getSecondaryAlternatesAlleles());

                    // FIXME: assumes there wont be multinucleotide positions with CNVs and short variants mixed
                    List keyFieldsList;
                    List originalKeyFieldsList;
//                    String copyNumberString = null;
                    if (variant.isSymbolic()) {
//                        if (VariantType.CNV.equals(variant.getType())) {
//                            String sampleName = variant.getStudies().get(0).getSamplesName().iterator().next();
//                            copyNumberString = variant.getStudies().get(0).getSampleData(sampleName).get(COPY_NUMBER_TAG);
//                        }
//                        keyFieldsList = normalizeSymbolic(start, end, reference, alternates, copyNumberString);
                        keyFieldsList = normalizeSymbolic(start, end, reference, alternates);
                    } else {
                        keyFieldsList = normalize(chromosome, start, reference, alternates);
                    }
                    originalKeyFieldsList = keyFieldsList.stream().filter(k -> !k.isReferenceBlock()).collect(Collectors.toList());
                    boolean sameVariant = keyFieldsList.size() == 1
                            && keyFieldsList.get(0).getStart() == start
                            && keyFieldsList.get(0).getReference().equals(reference)
                            && keyFieldsList.get(0).getAlternate().equals(alternate);

                    // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
                    for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
                        String call = start + ":" + reference + ":" + String.join(",", alternates) + ":" + keyFields.getNumAllele();

                        final Variant normalizedVariant;
                        final StudyEntry normalizedEntry;
                        final List> samplesData;
                        if (reuse && keyFieldsList.size() == 1) {   //Only reuse for non multiallelic variants
                            //Reuse variant. Set new fields.
                            normalizedVariant = variant;
                            variant.setStart(keyFields.getStart());
                            variant.setEnd(keyFields.getEnd());
                            variant.setReference(keyFields.getReference());
                            variant.setAlternate(keyFields.getAlternate());
                            variant.resetLength();
                            variant.resetType();
                            // Variant is being reused, must ensure the SV field si appropriately created
//                            if (isSymbolic(variant)) {
//                                StructuralVariation sv = getStructuralVariation(variant, keyFields, copyNumberString);
//                                variant.setSv(sv);
//                            }
                            normalizedEntry = entry;
                            entry.getFiles().forEach(fileEntry -> fileEntry.setCall(sameVariant ? "" : call)); //TODO: Check file attributes
                            samplesData = entry.getSamplesData();
                        } else {
                            normalizedVariant = newVariant(variant, keyFields);

                            normalizedEntry = new StudyEntry();
                            normalizedEntry.setStudyId(entry.getStudyId());
                            normalizedEntry.setSamplesPosition(entry.getSamplesPosition());
                            normalizedEntry.setFormat(entry.getFormat());

                            List files = new ArrayList<>(entry.getFiles().size());
                            for (FileEntry file : entry.getFiles()) {
                                HashMap attributes = new HashMap<>(file.getAttributes()); //TODO: Check file attributes
                                files.add(new FileEntry(file.getFileId(), sameVariant ? "" : call, attributes));
                            }
                            normalizedEntry.setFiles(files);
                            normalizedVariant.addStudyEntry(normalizedEntry);
                            samplesData = newSamplesData(entry.getSamplesData().size(), entry.getFormat().size());
                        }

                        if (keyFields.isReferenceBlock()) {
                            normalizedVariant.setType(VariantType.NO_VARIATION);
                            normalizedEntry.getFiles().forEach(fileEntry -> fileEntry.getAttributes().put("END", Integer.toString(keyFields.getEnd())));
                        }


                        //Set normalized secondary alternates
                        List reorderedKeyFields = reorderVariantKeyFields(chromosome, keyFields, keyFieldsList);
                        normalizedEntry.setSecondaryAlternates(getSecondaryAlternates(keyFields, reorderedKeyFields));

                        VariantAlternateRearranger rearranger = null;
                        if (originalKeyFieldsList.size() > 1 && !reorderedKeyFields.isEmpty()) {
                            rearranger = new VariantAlternateRearranger(originalKeyFieldsList, reorderedKeyFields, rearrangerConf);
                        }

                        //Set normalized samples data
                        try {
                            List format = entry.getFormat();
                            if (keyFields.getPhaseSet() != null) {
                                if (!normalizedEntry.getFormatPositions().containsKey("PS")) {
                                    normalizedEntry.addFormat("PS");
                                    format = new ArrayList<>(normalizedEntry.getFormat());
                                }
                                // If no files are provided one must be created to ensure genotype calls are the same
                                // for all mnv-phased variants
                                if (normalizedEntry.getFiles().size() == 0) {
                                    // Use mnv string as file Id so that it can be later identified.
                                    normalizedEntry.setFiles(Collections.singletonList(new FileEntry(keyFields.getPhaseSet(), call, null)));
                                }
                            }
                            List> normalizedSamplesData = normalizeSamplesData(keyFields,
                                    entry.getSamplesData(), format, reference, alternates, rearranger, samplesData);
                            normalizedEntry.setSamplesData(normalizedSamplesData);
                            normalizedVariants.add(normalizedVariant);

                        } catch (Exception e) {
                            logger.warn("Error parsing variant " + call + ", numAllele " + keyFields.getNumAllele(), e);
                            throw e;
                        }
                    }
                }
            }
        }

        return normalizedVariants;
    }

    private Collection sortByPosition(List keyFieldsList) {
        List sortedKeyFields = new ArrayList<>(keyFieldsList);
        sortedKeyFields.sort(Comparator.comparingInt(VariantKeyFields::getStart));
        return sortedKeyFields;
    }

//    private StructuralVariation getStructuralVariation(Variant variant, VariantKeyFields keyFields, String copyNumberString) {
//        int[] impreciseStart = getImpreciseStart(variant);
//        int[] impreciseEnd = getImpreciseEnd(variant);
//        StructuralVariation sv = variant.getSv() == null ? new StructuralVariation() : variant.getSv();
//        if (sv.getCiStartLeft() == null) {
//            sv.setCiStartLeft(impreciseStart[0]);
//        }
//        if (sv.getCiStartRight() == null) {
//            sv.setCiStartRight(impreciseStart[1]);
//        }
//        if (sv.getCiEndLeft() == null) {
//            sv.setCiEndLeft(impreciseEnd[0]);
//        }
//        if (sv.getCiEndRight() == null) {
//            sv.setCiEndRight(impreciseEnd[1]);
//        }
//        if (sv.getCopyNumber() == null && variant.getType().equals(VariantType.CNV)) {
//            if (StringUtils.isNumeric(copyNumberString)) {
//                sv.setCopyNumber(Integer.parseInt(copyNumberString));
//            } else {
//                // Assuming if copy number is not provided in the info field,
//                // it shall be indicated as part of the alternate allele string
//                Integer copyNumber = Variant.getCopyNumberFromAlternate(keyFields.getAlternate());
//                if (copyNumber != null) {
//                    sv.setCopyNumber(copyNumber);
//                }
//            }
//            sv.setType(Variant.getCNVSubtype(sv.getCopyNumber()));
//        }
//        return sv;
//    }

//    private int[] getImpreciseStart(Variant variant) {
//        if (variant.getStudies()!= null
//                && !variant.getStudies().isEmpty()
//                && !variant.getStudies().get(0).getFiles().isEmpty()
//                && variant.getStudies().get(0).getFiles().get(0).getAttributes().containsKey(CIPOS_STRING)) {
//            String[] parts = variant.getStudies().get(0).getFiles().get(0).getAttributes().get(CIPOS_STRING).split(",");
//            return new int[]{variant.getStart() + Integer.parseInt(parts[0]),
//                    variant.getStart() + Integer.parseInt(parts[1])};
//        } else {
//            return new int[]{variant.getStart(), variant.getStart()};
//        }
//    }
//
//    private int[] getImpreciseEnd(Variant variant) {
//        if (variant.getStudies()!= null
//                && !variant.getStudies().isEmpty()
//                && !variant.getStudies().get(0).getFiles().isEmpty()
//                && variant.getStudies().get(0).getFiles().get(0).getAttributes().containsKey(CIEND_STRING)) {
//            String[] parts = variant.getStudies().get(0).getFiles().get(0).getAttributes().get(CIEND_STRING).split(",");
//            return new int[]{variant.getEnd() + Integer.parseInt(parts[0]),
//                    variant.getEnd() + Integer.parseInt(parts[1])};
//        } else {
//            return new int[]{variant.getEnd(), variant.getEnd()};
//        }
//    }

    public List normalizeSymbolic(Integer start, Integer end, String reference, String alternate) {
        return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate));
//        return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate), copyNumber);
    }

    public List normalizeSymbolic(Integer start, Integer end, String reference,
                                                    List alternates) {
        List list = new ArrayList<>(alternates.size());

        String newReference = reference;
        // Reference for SVs must be empty
        if (reference.length() == 1) {
            newReference = "";
            start++;
        } else if (reference.length() > 1) {
            throw new IllegalArgumentException("Invalid reference value found for symbolic variant " + start + "-"
                    + end + ":" + reference + ":" + String.join(",", alternates) + ". Reference can only "
                    + "cotain 0 or 1 nt, but no more. Please, check.");
        }

        int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present
        for (Iterator iterator = alternates.iterator(); iterator.hasNext(); numAllelesIdx++) {
            String newAlternate = iterator.next();
//            if (newAlternate.equals(CNV) && StringUtils.isNotEmpty(copyNumber)) {
//                // Alternate must be of the form , being xxx the number of copies
//                newAlternate = "";
//            }
            list.add(new VariantKeyFields(start, end, numAllelesIdx, newReference, newAlternate));
        }

        return list;
    }


    public List normalize(String chromosome, int position, String reference, String alternate) {
        return normalize(chromosome, position, reference, Collections.singletonList(alternate));
    }

    public List normalize(String chromosome, int position, String reference, List alternates) {
        List list = new ArrayList<>(alternates.size());
        int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present
        for (Iterator iterator = alternates.iterator(); iterator.hasNext(); numAllelesIdx++) {
            String currentAlternate = iterator.next();
            int referenceLen = reference.length();
            int alternateLen = currentAlternate.length();

            VariantKeyFields keyFields;
            if (referenceLen == 0) {
                keyFields = createVariantsFromInsertionEmptyRef(position, currentAlternate);
            } else if (alternateLen == 0) {
                keyFields = createVariantsFromDeletionEmptyAlt(position, reference);
            } else {
                keyFields = createVariantsFromNoEmptyRefAlt(position, reference, currentAlternate);
            }
            if (keyFields != null) {

                // To deal with cases such as A>GT
                boolean isMnv = (keyFields.getReference().length() > 1 && keyFields.getAlternate().length() >= 1)
                        || (keyFields.getAlternate().length() > 1 && keyFields.getReference().length() >= 1);
                if (decomposeMNVs && isMnv && alternates.size() == 1) {
                    for (VariantKeyFields keyFields1 : decomposeMNVSingleVariants(keyFields)) {
                        keyFields1.numAllele = numAllelesIdx;
                        keyFields1.phaseSet = chromosome + ":" + position + ":" + reference + ":" + currentAlternate;
                        list.add(keyFields1);
                    }
                } else {
                    if (decomposeMNVs && isMnv) {
                        logger.warn("Unable to decompose multiallelic with MNV variants -> "
                                + chromosome + ":" + position + ":" + reference + ":" + String.join(",", alternates));
                    }
                    keyFields.numAllele = numAllelesIdx;
                    list.add(keyFields);
                }
            }
        }

        if (generateReferenceBlocks) {
            list = generateReferenceBlocks(list, position, reference);
        }

        // Sort by numAllele, then by start.
        list.sort(Comparator.comparingInt(VariantKeyFields::getNumAllele).thenComparingInt(VariantKeyFields::getStart));
        return list;
    }

    private List generateReferenceBlocks(List list, int position, String reference) {

        // Skip for simple SNV
        if (list.size() == 1 && list.get(0).getStart() == position && list.get(0).getReference().equals(reference)) {
            return list;
        }

        // Create a reference block for all the positions.
        list.add(new VariantKeyFields(position, position + reference.length() - 1, 0, reference.substring(0, 1), "", true));

        // Have to remove overlapped reference blocks.
        for (int i = 0; i < list.size(); i++) {
            //Don't use iterators to avoid concurrent modification exceptions
            VariantKeyFields current = list.get(i);
            boolean isFinished = false;
            while (current.isReferenceBlock() && !isFinished) {
                VariantKeyFields newSlice = null;
                for (VariantKeyFields aux : list) {
                    if (aux == current) {
                        // Skip self
                        continue;
                    } else {
                        if (aux.getStart() <= current.getStart() && aux.getReferenceEnd() >= current.getEnd()) {
                            /* 1)
                             *     |----|   <- current
                             *  |--------|  <- aux
                             */
                            list.remove(i--);
                            break;
                        } else if (aux.getStart() <= current.getStart() && current.getStart() <= aux.getReferenceEnd()) {
                            /* 2)
                             *     |----|   <- current
                             *  |-----|     <- aux
                             */
                            if (aux.isReferenceBlock()) {
                                //Merge reference blocks
                                aux.setEnd(current.getEnd());
                                list.remove(i--);
                                break;
                            } else {
                                current.setStart(aux.getReferenceEnd() + 1);
                                // Have to find the correct current reference from the original reference
                                current.setReference(reference.substring(current.getStart() - position, current.getStart() - position + 1));
                            }
                        } else if (aux.getReferenceEnd() >= current.getEnd() && aux.getStart() <= current.getEnd()) {
                            /* 3)
                             *   |-----|    <- current
                             *     |-----|  <- aux
                             */
                            if (aux.isReferenceBlock()) {
                                //Merge reference blocks
                                aux.setStart(current.getStart());
                                aux.setReference(current.getReference());
                                list.remove(i--);
                                break;
                            } else {
                                current.setEnd(aux.getStart() - 1);
                            }
                        } else if (aux.getStart() <= current.getEnd() && aux.getReferenceEnd() > current.getStart()) {
                            /* 4) As first case, but upside down
                             *  |-------|  <- current
                             *    |--|     <- aux
                             */
                            if (!aux.isReferenceBlock()) {
                                /* Split the current block into 2 blocks
                                 *  |-|  |--|  <- current + newSlip
                                 *    |--|     <- aux
                                 */
                                int blockStart = aux.getReferenceEnd() + 1;
                                int blockEnd = current.getEnd();
                                if (blockEnd >= blockStart) {
                                    // Have to find the correct current reference from the original reference
                                    String blockRef = reference.substring(blockStart - position, blockStart - position + 1);
                                    if (newSlice != null) {
                                        throw new IllegalStateException();
                                    }
                                    newSlice = new VariantKeyFields(blockStart,  blockEnd, 0, blockRef, "", true);
                                }

                                current.setEnd(aux.getStart() - 1);
                                if (current.getEnd() < current.getStart()) {
                                    list.remove(i--);
                                }
                                break;
                            } // else, will be fixed later

                        }   /* else, nothing to do
                             * 5)
                             *  |--|       <- current
                             *       |--|  <- aux
                             */
                    }
                }
                if (newSlice != null) {
                    list.add(newSlice);
                } else {
                    isFinished = true;
                }
            }
        }
        return list;
    }

    private List decomposeMNVSingleVariants(VariantKeyFields keyFields) {
        SequencePair sequenceAlignment = getPairwiseAlignment(keyFields.getReference(),
                keyFields.getAlternate());
        return decomposeAlignmentSingleVariants(sequenceAlignment, keyFields.getStart());
    }

    private List decomposeAlignmentSingleVariants(SequencePair sequenceAlignment,
                                                                    int genomicStart) {

        String reference = sequenceAlignment.getTarget().getSequenceAsString();
        String alternate = sequenceAlignment.getQuery().getSequenceAsString();
        List keyFieldsList = new ArrayList<>();
        VariantKeyFields keyFields = null;
        char previousReferenceChar = 0;
        char previousAlternateChar = 0;
        // Assume that as a result of the alignment "reference" and "alternate" Strings are of the same length
        for (int i = 0; i < reference.length(); i++) {
            char referenceChar = reference.charAt(i);
            char alternateChar = alternate.charAt(i);
            // Insertion
            if (referenceChar == '-') {
                // Assume there cannot be a '-' at the reference and alternate aligned sequences at the same position
                if (alternateChar == '-') {
                    logger.error("Unhandled case found after pairwise alignment of MNVs. Alignment result: "
                            + reference + "/" + alternate);
                }
                // Current character is a continuation of an insertion
                if (previousReferenceChar == '-') {
                    keyFields.setAlternate(keyFields.getAlternate() + alternateChar);
                // New insertion found, create new keyFields
                } else {
                    keyFields = new VariantKeyFields(genomicStart + i, genomicStart + i, "",
                            String.valueOf(alternateChar));
                    keyFieldsList.add(keyFields);
                }
            // Deletion
            } else if (alternateChar == '-') {
                // Current character is a continuation of a deletion
                if (previousAlternateChar == '-') {
                    keyFields.setReference(keyFields.getReference() + referenceChar);
                    keyFields.setEnd(keyFields.getEnd()+1);
                // New deletion found, create new keyFields
                } else {
                    keyFields = new VariantKeyFields(genomicStart + i, genomicStart + i, String.valueOf(referenceChar),
                            "");
                    keyFieldsList.add(keyFields);
                }
            // SNV
            } else if (referenceChar != alternateChar) {
                keyFields = new VariantKeyFields(genomicStart + i, genomicStart + i, String.valueOf(referenceChar),
                        String.valueOf(alternateChar));
                keyFieldsList.add(keyFields);
            }
            previousReferenceChar = referenceChar;
            previousAlternateChar = alternateChar;
        }

        return keyFieldsList;
    }

    private SequencePair getPairwiseAlignment(String seq1, String seq2) {
        DNASequence target = null;
        DNASequence query = null;
        try {
            target = new DNASequence(seq1, AmbiguityDNACompoundSet.getDNACompoundSet());
            query = new DNASequence(seq2, AmbiguityDNACompoundSet.getDNACompoundSet());
        } catch (CompoundNotFoundException e) {
            String msg = "Error when creating DNASequence objects for " + seq1 + " and " + seq2 + " prior to pairwise "
                    + "sequence alignment";
            logger.error(msg, e);
            throw new VariantNormalizerException(msg, e);
        }
        SubstitutionMatrix substitutionMatrix = SubstitutionMatrixHelper.getNuc4_4();
        SimpleGapPenalty gapP = new SimpleGapPenalty();
        gapP.setOpenPenalty((short)5);
        gapP.setExtensionPenalty((short)2);
        SequencePair psa = Alignments.getPairwiseAlignment(query, target,
                Alignments.PairwiseSequenceAlignerType.GLOBAL, gapP, substitutionMatrix);

        return psa;
    }

    /**
     * Non normalizable variants
     * TODO: Add {@link VariantType#SYMBOLIC} variants?
     */
    private boolean isNormalizable(Variant variant) {
        return !variant.getType().equals(VariantType.NO_VARIATION) && !variant.getType().equals(VariantType.SYMBOLIC);
    }

    protected VariantKeyFields createVariantsFromInsertionEmptyRef(int position, String alt) {
        return new VariantKeyFields(position, position - 1, "", alt);
    }

    protected VariantKeyFields createVariantsFromDeletionEmptyAlt(int position, String reference) {
        return new VariantKeyFields(position, position + reference.length() - 1, reference, "");
    }

    /**
     * Calculates the start, end, reference and alternate of an SNV/MNV/INDEL where the
     * reference and the alternate are not empty.
     *
     * This task comprises 2 steps: removing the trailing bases that are
     * identical in both alleles, then the leading identical bases.
     *
     * @param position Input starting position
     * @param reference Input reference allele
     * @param alt Input alternate allele
     * @return The new start, end, reference and alternate alleles
     */
    protected VariantKeyFields createVariantsFromNoEmptyRefAlt(int position, String reference, String alt) {
        int indexOfDifference;
        // Remove the trailing bases
        indexOfDifference = reverseIndexOfDifference(reference, alt);

//        VariantKeyFields startReferenceBlock = null;
        final VariantKeyFields keyFields;
//        VariantKeyFields endReferenceBlock = null;

//        if (generateReferenceBlocks) {
//            if (indexOfDifference > 0) {
//                //Generate a reference block from the trailing bases
//                String blockRef = reference.substring(reference.length() - indexOfDifference, reference.length() - indexOfDifference + 1);
//                int blockStart = position + reference.length() - indexOfDifference;
//                int blockEnd = position + reference.length() - 1;   // Base-1 ending
//                endReferenceBlock = new VariantKeyFields(blockStart, blockEnd, blockRef, "")
//                        .setReferenceBlock(true);
//            } else if (indexOfDifference < 0) {
//                //Reference and alternate are equals! Generate a single reference block
//                String blockRef = reference.substring(0, 1);
//                int blockStart = position;
//                int blockEnd = position + reference.length() - 1;   // Base-1 ending
//                VariantKeyFields referenceBlock = new VariantKeyFields(blockStart, blockEnd, blockRef, "")
//                        .setReferenceBlock(true);
//                return Collections.singletonList(referenceBlock);
//            }
//        }


        reference = reference.substring(0, reference.length() - indexOfDifference);
        alt = alt.substring(0, alt.length() - indexOfDifference);

        // Remove the leading bases
        indexOfDifference = StringUtils.indexOfDifference(reference, alt);
        if (indexOfDifference < 0) {
            //There reference and the alternate are the same
            return null;
        } else if (indexOfDifference == 0) {
            keyFields = new VariantKeyFields(position, position + reference.length() - 1, reference, alt);
//            if (reference.length() > alt.length()) { // Deletion
//                keyFields = new VariantKeyFields(position, position + reference.length() - 1, reference, alt);
//            } else { // Insertion
//                keyFields = new VariantKeyFields(position, position + alt.length() - 1, reference, alt);
//            }
        } else {
//            if (generateReferenceBlocks) {
//                String blockRef = reference.substring(0, 1);
//                int blockStart = position;
//                int blockEnd = position + indexOfDifference - 1;   // Base-1 ending
//                startReferenceBlock = new VariantKeyFields(blockStart, blockEnd, blockRef, "")
//                        .setReferenceBlock(true);
//            }

            int start = position + indexOfDifference;
            String ref = reference.substring(indexOfDifference);
            String inAlt = alt.substring(indexOfDifference);
//            int end = reference.length() > alt.length()
//                    ? position + reference.length() - 1
//                    : position + alt.length() - 1;
            int end = position + reference.length() - 1;
            keyFields = new VariantKeyFields(start, end, ref, inAlt);
        }

//        if (!generateReferenceBlocks) {
//            return Collections.singletonList(keyFields);
//        } else {
//            List list = new ArrayList<>(1 + (startReferenceBlock == null ? 0 : 1) + (endReferenceBlock == null ? 0 : 1));
//            if (startReferenceBlock != null) {
//                list.add(startReferenceBlock);
//            }
//            list.add(keyFields);
//            if (endReferenceBlock != null) {
//                list.add(endReferenceBlock);
//            }
//            return list;
//        }
        return keyFields;
    }

    /**
     * 

Compares two CharSequences, and returns the index beginning from the behind, * at which the CharSequences begin to differ.

* * Based on {@link StringUtils#indexOfDifference} * *

For example, * {@code reverseIndexOfDifference("you are a machine", "i have one machine") -> 8}

* *
     * reverseIndexOfDifference(null, null) = -1
     * reverseIndexOfDifference("", "") = -1
     * reverseIndexOfDifference("", "abc") = 0
     * reverseIndexOfDifference("abc", "") = 0
     * reverseIndexOfDifference("abc", "abc") = -1
     * reverseIndexOfDifference("ab", "xyzab") = 2
     * reverseIndexOfDifference("abcde", "xyzab") = 2
     * reverseIndexOfDifference("abcde", "xyz") = 0
     * 
* * @param cs1 the first CharSequence, may be null * @param cs2 the second CharSequence, may be null * @return the index from behind where cs1 and cs2 begin to differ; -1 if they are equal */ public static int reverseIndexOfDifference(final CharSequence cs1, final CharSequence cs2) { if (cs1 == cs2) { return StringUtils.INDEX_NOT_FOUND; } if (cs1 == null || cs2 == null) { return 0; } int i; int cs1Length = cs1.length(); int cs2Length = cs2.length(); for (i = 0; i < cs1Length && i < cs2Length; ++i) { if (cs1.charAt(cs1Length - i - 1) != cs2.charAt(cs2Length - i - 1)) { break; } } if (i < cs2Length || i < cs1Length) { return i; } return StringUtils.INDEX_NOT_FOUND; } public List> normalizeSamplesData(VariantKeyFields variantKeyFields, final List> samplesData, List format, String reference, List alternateAlleles, VariantAlternateRearranger rearranger) throws NonStandardCompliantSampleField { return normalizeSamplesData(variantKeyFields, samplesData, format, reference, alternateAlleles, rearranger, null); } public List> normalizeSamplesData(VariantKeyFields variantKeyFields, final List> samplesData, List format, String reference, List alternateAlleles, VariantAlternateRearranger rearranger, List> reuseSampleData) throws NonStandardCompliantSampleField { List> newSampleData; if (reuseSampleData == null) { newSampleData = newSamplesData(samplesData.size(), format.size()); } else { newSampleData = reuseSampleData; } // Normalizing an mnv and no sample data was provided in the original variant - need to create sample data to // indicate the phase set if (variantKeyFields.getPhaseSet() != null && samplesData.size() == 0) { if (format.equals(Collections.singletonList("PS"))) { newSampleData.add(Collections.singletonList(variantKeyFields.getPhaseSet())); } else { List sampleData = new ArrayList<>(format.size()); for (String f : format) { if (f.equals("PS")) { sampleData.add(variantKeyFields.getPhaseSet()); } else { sampleData.add(""); } } newSampleData.add(sampleData); } } else { for (int sampleIdx = 0; sampleIdx < samplesData.size(); sampleIdx++) { List sampleData = samplesData.get(sampleIdx); // TODO we could check that format and sampleData sizes are equals // if (sampleData.size() == 1 && sampleData.get(0).equals(".")) { // newSampleData.get(sampleIdx).set(0, "./."); // System.out.println("Format data equals '.'"); // continue; // } Genotype genotype = null; for (int formatFieldIdx = 0; formatFieldIdx < format.size(); formatFieldIdx++) { String formatField = format.get(formatFieldIdx); // It may happen that the Format contains other fields that were not in the original format, // or that some sampleData array is smaller than the format list. // If the variant was a splitted MNV, a new field 'PS' is added to the format (if missing), so it may // not be in the original sampleData. String sampleField = sampleData.size() > formatFieldIdx ? sampleData.get(formatFieldIdx) : ""; if (formatField.equalsIgnoreCase("GT")) { // Save alleles just in case they are necessary for GL/PL/GP transformation // Use the original alternates to create the genotype. genotype = new Genotype(sampleField, reference, alternateAlleles); if (variantKeyFields.isReferenceBlock()) { int[] allelesIdx = genotype.getAllelesIdx(); for (int i = 0; i < allelesIdx.length; i++) { if (allelesIdx[i] > 0) { allelesIdx[i] = 0; } } } else if (rearranger != null) { genotype = rearranger.rearrangeGenotype(genotype); } if (normalizeAlleles && !genotype.isPhased()) { genotype.normalizeAllelesIdx(); } sampleField = genotype.toString(); } else if (formatField.equals("PS")) { if (variantKeyFields.getPhaseSet() != null) { sampleField = variantKeyFields.getPhaseSet(); } } else { if (rearranger != null) { sampleField = rearranger.rearrange(formatField, sampleField, genotype == null ? null : genotype.getPloidy()); } } List data = newSampleData.get(sampleIdx); int finalSampleIdx = sampleIdx; if (data.size() > formatFieldIdx) { secureSet(data, formatFieldIdx, sampleField, list -> newSampleData.set(finalSampleIdx, list)); } else { secureAdd(data, sampleField, list -> newSampleData.set(finalSampleIdx, list)); } } } } return newSampleData; } private List secureAdd(List list, T data, Consumer> onNewList) { try { list.add(data); } catch (UnsupportedOperationException e) { list = new ArrayList<>(list); list.add(data); onNewList.accept(list); } return list; } private List secureSet(List list, int idx, T data, Consumer> onNewList) { try { list.set(idx, data); } catch (UnsupportedOperationException e ) { list = new ArrayList<>(list); list.set(idx, data); onNewList.accept(list); } return list; } /** * Gets an array for reordening the positions of a format field with Number=G and ploidy=2. * * In those fields where there is a value per genotype, if we change the order of the alleles, * the order of the genotypes will also change. * * The genotypes ordering is defined in the Vcf spec : https://samtools.github.io/hts-specs/VCFv4.3.pdf as "Genotype Ordering" * Given a number of alleles N and a ploidy of P, the order algorithm is: * for (a_p = 0; a_p < N: a_p++) * for (a_p-1 = 0; a_p-1 <= a_p: a_p-1++) * ... * for (a_1 = 0; a_1 <= a_2: a_1++) * print(a_1, a_2, ..., a_p) * * i.e: * N=2,P=2: 00,01,11,02,12,22 * N=3,P=2: 00,01,11,02,12,22,03,13,23,33 * * With P=2, given a genotype a/b, where a2 */ private int[] getGenotypesReorderingMap(int numAllele, int[] alleleMap) { int numAlleles = alleleMap.length; // int ploidy = 2; int key = numAllele * 100 + numAlleles; int[] map = genotypeReorderMapCache.get(key); if (map != null) { return map; } else { ArrayList mapList = new ArrayList<>(); for (int originalA2 = 0; originalA2 < alleleMap.length; originalA2++) { int newA2 = ArrayUtils.indexOf(alleleMap, originalA2); for (int originalA1 = 0; originalA1 <= originalA2; originalA1++) { int newA1 = ArrayUtils.indexOf(alleleMap, originalA1); if (newA1 <= newA2) { mapList.add((newA2 * (newA2 + 1) / 2 + newA1)); } else { mapList.add((newA1 * (newA1 + 1) / 2 + newA2)); } } } map = new int[mapList.size()]; for (int i = 0; i < mapList.size(); i++) { map[i] = mapList.get(i); } genotypeReorderMapCache.put(key, map); return map; } } private Variant newVariant(Variant variant, VariantKeyFields keyFields) { Variant normalizedVariant = new Variant(variant.getChromosome(), keyFields.getStart(), keyFields.getEnd(), keyFields.getReference(), keyFields.getAlternate()); normalizedVariant.setIds(variant.getIds()); normalizedVariant.setStrand(variant.getStrand()); // normalizedVariant.setAnnotation(variant.getAnnotation()); // if (isSymbolic(variant)) { // StructuralVariation sv = getStructuralVariation(normalizedVariant, keyFields, null); // normalizedVariant.setSv(sv); // } return normalizedVariant; } public List reorderVariantKeyFields(String chromosome, VariantKeyFields alternate, List alternates) { List secondaryAlternates; if (alternates.size() == 1 || alternate.isReferenceBlock()) { // If there is only one alternate, there are no secondary alternates // Reference blocks do not have secondary alternates secondaryAlternates = Collections.emptyList(); } else if (alternate.getPhaseSet() != null) { for (VariantKeyFields variantKeyFields : alternates) { if (variantKeyFields.getNumAllele() > 0) { throw new IllegalStateException("Unable to resolve multiallelic with MNV variants -> " + alternates.stream() .map((v) -> chromosome + ":" + v.toString()) .collect(Collectors.joining(" , "))); } } secondaryAlternates = Collections.emptyList(); } else { secondaryAlternates = new ArrayList<>(alternates.size()); // Move the current alternate to the first position secondaryAlternates.add(alternate); for (VariantKeyFields keyFields : alternates) { if (keyFields.isReferenceBlock()) { continue; } if (!keyFields.equals(alternate)) { secondaryAlternates.add(keyFields); } } } return secondaryAlternates; } public List getSecondaryAlternates(VariantKeyFields alternate, List reorderedKeyFields) { List secondaryAlternates = new ArrayList<>(reorderedKeyFields.size()); for (VariantKeyFields keyFields : reorderedKeyFields) { if (!keyFields.equals(alternate)) { secondaryAlternates.add(new AlternateCoordinate( // Chromosome is always the same, do not set null, //Set position only if is different from the original one alternate.getStart() == keyFields.getStart() ? null : keyFields.getStart(), alternate.getEnd() == keyFields.getEnd() ? null : keyFields.getEnd(), //Set reference only if is different from the original one alternate.getReference().equals(keyFields.getReference()) ? null : keyFields.getReference(), keyFields.getAlternate(), Variant.inferType(keyFields.getReference(), keyFields.getAlternate()) )); } } return secondaryAlternates; } private List> newSamplesData(int samplesSize, int formatSize) { List> newSampleData; newSampleData = new ArrayList<>(samplesSize); for (int i = 0; i < samplesSize; i++) { newSampleData.add(Arrays.asList(new String[formatSize])); } return newSampleData; } public static class VariantKeyFields { private int start; private int end; private int numAllele; private String phaseSet; private String reference; private String alternate; boolean referenceBlock; public VariantKeyFields(int start, int end, String reference, String alternate) { this(start, end, 0, reference, alternate, false); } public VariantKeyFields(int start, int end, int numAllele, String reference, String alternate) { this(start, end, numAllele, reference, alternate, false); } public VariantKeyFields(int start, int end, int numAllele, String reference, String alternate, boolean referenceBlock) { this.start = start; this.end = end; this.numAllele = numAllele; this.reference = reference; this.alternate = alternate; this.referenceBlock = referenceBlock; } public int getStart() { return start; } public VariantKeyFields setStart(int start) { this.start = start; return this; } public int getEnd() { return end; } public int getReferenceEnd() { return start + reference.length() - 1; } public VariantKeyFields setEnd(int end) { this.end = end; return this; } public int getNumAllele() { return numAllele; } public VariantKeyFields setNumAllele(int numAllele) { this.numAllele = numAllele; return this; } public String getPhaseSet() { return phaseSet; } public VariantKeyFields setPhaseSet(String phaseSet) { this.phaseSet = phaseSet; return this; } public String getReference() { return reference; } public VariantKeyFields setReference(String reference) { this.reference = reference; return this; } public String getAlternate() { return alternate; } public VariantKeyFields setAlternate(String alternate) { this.alternate = alternate; return this; } public boolean isReferenceBlock() { return referenceBlock; } public VariantKeyFields setReferenceBlock(boolean referenceBlock) { this.referenceBlock = referenceBlock; return this; } @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof VariantKeyFields)) return false; VariantKeyFields that = (VariantKeyFields) o; if (start != that.start) return false; if (end != that.end) return false; if (numAllele != that.numAllele) return false; if (reference != null ? !reference.equals(that.reference) : that.reference != null) return false; return !(alternate != null ? !alternate.equals(that.alternate) : that.alternate != null); } @Override public int hashCode() { int result = start; result = 31 * result + end; result = 31 * result + numAllele; result = 31 * result + (reference != null ? reference.hashCode() : 0); result = 31 * result + (alternate != null ? alternate.hashCode() : 0); return result; } @Override public String toString() { return start + "-" + end + ":" + reference + ":" + alternate + ":" + numAllele + (phaseSet == null ? "" : ("(ps:" + phaseSet + ")")) + (referenceBlock ? ("(refBlock)") : ""); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy