/*
 * org.broadinstitute.hellbender.testutils.TestingReadThreadingGraph Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of gatk Show documentation
 * Development on GATK 4
 * The newest version!
 */
package org.broadinstitute.hellbender.testutils;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A {@link ReadThreadingGraph} that can be built directly from a compact string description,
 * intended for use in tests only.
 *
 * <p>
 *     The description is an optional leading properties section in square brackets that must
 *     declare the kmer size ({@code ks=N} or {@code kmerSize=N}), followed by any number of paths in
 *     curly brackets. A path may start with a label followed by a colon (the label {@code REF} marks
 *     the reference path) and is a list of elements separated by {@code ->}. Each element is a
 *     sequence optionally followed by an identifier in parentheses; identifiers let later elements
 *     refer back to a vertex that was defined earlier.
 * </p>
 */
public final class TestingReadThreadingGraph extends ReadThreadingGraph {
    /*************************************************************
     * Simple string representation support for testing purposes *
     *************************************************************/

    /** Matches the leading {@code [ ... ]} properties section of a graph description. */
    private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]");

    /** Matches one {@code { label: elements }} path; group 2 is the optional label, group 3 the elements. */
    private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}");

    /** Extracts the kmer size (group 2) from the properties section. */
    private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]");

    private static final long serialVersionUID = 1L;


    /**
     * Constructs a read-threading-graph for a string representation.
     *
     * <p>
     *     Note: only used for testing.
     *     Check out {@code HaplotypeGraphUnitTest} for examples.
     * </p>
     *
     * @param s the string representation of the graph; cannot be {@code null}.
     * @throws IllegalArgumentException if {@code s} is {@code null} or does not declare a valid kmer size.
     */
    public TestingReadThreadingGraph(final String s) {
        super(kmerSizeFromString(s));
        applyString(s);
        setAlreadyBuilt();
    }

    /**
     * Obtain the kmer size for the string representation.
     *
     * @param str the source string representation.
     * @return 1 or greater.
     * @throws IllegalArgumentException if {@code str} is {@code null}, does not declare the kmer size,
     *     or declares a kmer size smaller than 1. A value too large for an {@code int} surfaces as a
     *     {@link NumberFormatException}, which is itself an {@code IllegalArgumentException}.
     */
    private static int kmerSizeFromString(final String str) {
        if (str == null) {
            throw new IllegalArgumentException("the input graph spec cannot be null");
        }
        final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str);
        if (!matcher.find()) {
            throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize");
        }
        final int result = Integer.parseInt(matcher.group(2));
        // The pattern accepts any run of digits, including "0"; enforce the documented lower bound here.
        if (result < 1) {
            throw new IllegalArgumentException("the input graph spec indicates an invalid kmerSize: " + result);
        }
        return result;
    }

    /**
     * Apply description string into the graph.
     *
     * <p>
     *     Note: this is done just for testing purposes.
     *     Check out {@code HaplotypeGraphUnitTest} for examples.
     * </p>
     *
     * <p>
     *     Every path found after the properties section is threaded into the graph in order of
     *     appearance. Edges that belong to the path labelled {@code REF} are flagged as reference edges.
     * </p>
     *
     * @param str the string representation.
     * @throws IllegalArgumentException if a new source element does not have exactly {@code kmerSize}
     *     bases or the first element of a path does not agree with the vertex it refers to. The
     *     remaining format checks are delegated to {@code Utils.validateArg}.
     */
    private void applyString(final String str) {
        final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str);
        final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0;

        final String pathString = str.substring(pathStart);
        final Matcher pathMatcher = PATH_PATTERN.matcher(pathString);

        boolean referenceFound = false;
        // Vertices that were given an explicit "(id)" so far, so that later elements can refer to them.
        final Map<String, MultiDeBruijnVertex> vertexById = new HashMap<>();

        // Loop between path strings and add them one by one.
        while (pathMatcher.find()) {
            final String label = pathMatcher.group(2);
            final boolean isReference = "REF".equals(label);
            Utils.validateArg(!(referenceFound && isReference), "there are two reference paths");
            referenceFound |= isReference;

            // Divide each path into its elements getting a list of sequences and labels if applies:
            final String elementsString = pathMatcher.group(3);
            final String[] elements = elementsString.split("\\s*->\\s*");
            Utils.validateArg(elements.length > 0, "empty path not allowed");
            final String[] seqs = new String[elements.length];
            final String[] ids = new String[elements.length];
            for (int i = 0; i < elements.length; i++) {
                ids[i] = pathElementId(elements[i]);
                seqs[i] = pathElementSeq(elements[i]);
                Utils.validateArg(!(seqs[i].isEmpty() && ids[i] == null), "path with empty element without an id");
            }

            // The first element is a brand new source unless its id points to an already known vertex.
            final MultiDeBruijnVertex knownFirstVertex = ids[0] == null ? null : vertexById.get(ids[0]);
            final boolean isSource = knownFirstVertex == null;
            if (isSource && seqs[0].length() != kmerSize) {
                throw new IllegalArgumentException("source sequence length must be the same as the kmerSize "
                        + ids[0] + ' ' + seqs[0] + ' ' + pathMatcher.group());
            }
            final MultiDeBruijnVertex firstVertex;
            if (isSource) {
                firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes());
                addVertex(firstVertex);
                if (ids[0] != null) {
                    vertexById.put(ids[0], firstVertex);
                }
            } else {
                firstVertex = knownFirstVertex;
            }
            if (!seqs[0].isEmpty() &&
                    ((isSource && !firstVertex.getSequenceString().equals(seqs[0]))
                            || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) {
                throw new IllegalArgumentException("mismatched first element sequence");
            }

            MultiDeBruijnVertex lastVertex = firstVertex;
            for (int i = 1; i < elements.length; i++) {
                final String element = elements[i];
                // A non-source element contributes a single base. The base may only be left out when the
                // element's id refers to a vertex that is already known (checked further down).
                Utils.validateArg(seqs[i].length() <= 1, "non-source vertex sequence must have length 1");
                final MultiDeBruijnVertex knownVertex = ids[i] == null ? null : vertexById.get(ids[i]);
                final MultiDeBruijnVertex nextVertex;
                if (knownVertex != null) {
                    nextVertex = knownVertex;
                } else {
                    // Without a known vertex to jump to we need the base in order to find or create the
                    // successor; fail cleanly rather than with an array index error.
                    Utils.validateArg(!seqs[i].isEmpty(),
                            () -> "element with an undefined id must indicate its base: " + element);
                    final byte base = seqs[i].getBytes()[0];
                    final Set<MultiDeBruijnVertex> nextVertices = getNextVertices(lastVertex, base);
                    if (nextVertices.isEmpty()) {
                        nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(), base));
                        addVertex(nextVertex);
                    } else {
                        nextVertex = nextVertices.iterator().next();
                    }
                    if (ids[i] != null) {
                        vertexById.put(ids[i], nextVertex);
                    }
                }
                MultiSampleEdge edge = addEdge(lastVertex, nextVertex);
                if (edge == null) {
                    // NOTE(review): assumes JGraphT semantics, where addEdge returns null when the edge
                    // is already present; reuse the existing edge in that case -- confirm against the superclass.
                    edge = getEdge(lastVertex, nextVertex);
                }
                if (isReference) {
                    edge.setIsRef(true);
                }
                lastVertex = nextVertex;
            }
        }
    }

    /**
     * Return the collection of outgoing vertices that expand this vertex with a particular base.
     *
     * @param v original vertex.
     * @param b expanding base.
     * @return never null, but perhaps an empty set. You cannot assume that you can modify the result.
     */
    @VisibleForTesting
    Set<MultiDeBruijnVertex> getNextVertices(final MultiDeBruijnVertex v, final byte b) {
        Utils.nonNull(v, "the input vertex cannot be null");
        Utils.validateArg(vertexSet().contains(v), "the vertex must be present in the graph");
        final List<MultiDeBruijnVertex> result = new ArrayList<>();
        for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) {
            if (w.getSuffix() == b) {
                result.add(w);
            }
        }
        // Avoid allocating a full HashSet for the common cases of zero or one match.
        switch (result.size()) {
            case 0: return Collections.emptySet();
            case 1: return Collections.singleton(result.get(0));
            default:
                return new HashSet<>(result);
        }
    }

    /**
     * Returns the identifier of a path element in the string representation, that is the trimmed
     * text enclosed between the first {@code '('} and the last {@code ')'}.
     *
     * @param element the query element.
     * @return {@code null} if the element has no opening parenthesis, otherwise a non-empty identifier.
     */
    private static String pathElementId(final String element) {
        final int openBracketPosition = element.indexOf('(');

        if (openBracketPosition == -1) {
            return null;
        }

        final int closeBracketPosition = element.lastIndexOf(')');
        // Requiring the closing bracket to come after the opening one also covers the "not found" (-1)
        // case and prevents substring from failing on inputs such as ")A(".
        Utils.validateArg(closeBracketPosition > openBracketPosition, () -> "non-closed id parantesys found in element: " + element);
        final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim();
        Utils.validateArg(!result.isEmpty(), () -> "empty id found in element: " + element);
        return result;
    }

    /**
     * Returns the sequence of a path element in the string representation, that is the trimmed
     * text that precedes the identifier's opening parenthesis, or the whole trimmed element when
     * it has no identifier.
     *
     * @param element the query element.
     * @return never {@code null}, but perhaps an empty string.
     */
    private static String pathElementSeq(final String element) {
        final int parenthesisPos = element.indexOf('(');

        if (parenthesisPos == -1) {
            return element.trim();
        }

        return element.substring(0, parenthesisPos).trim();
    }

    /**
     * Shifts a byte sequence one position to the left, dropping its first base, and places the
     * given base at the end, so the result has the same length as the input.
     *
     * @param sequence sequence where to add the base to; must contain at least one base.
     * @param b base to add.
     * @return never {@code null}, a new array each time.
     */
    private static byte[] extendSequence(final byte[] sequence, final byte b) {
        final byte[] result = new byte[sequence.length];
        System.arraycopy(sequence, 1, result, 0, sequence.length - 1);
        result[result.length - 1] = b;
        return result;
    }

    /**
     * Returns the result of {@code super.clone()} narrowed to this class.
     */
    @Override
    public TestingReadThreadingGraph clone() {
        return (TestingReadThreadingGraph) super.clone();
    }
}