All downloads are free. The search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.testutils.TestingReadThreadingGraph Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.testutils;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public final class TestingReadThreadingGraph extends ReadThreadingGraph {
    /*************************************************************
     * Simple string representation support for testing purposes *
     *************************************************************/

    private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]");
    private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}");
    private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]");
    private static final long serialVersionUID = 1l;


    /**
     * Constructs a read-threading-graph for a string representation.
     *
     * 

* Note: only used for testing. * Checkout {@see HaplotypeGraphUnitTest} for examples. *

* @param s the string representation of the graph {@code null}. */ public TestingReadThreadingGraph(final String s) { super(kmerSizeFromString(s)); applyString(s); setAlreadyBuilt(); } /** * Obtain the kmer size for the string representation. * @param str the source string representation. * @return 1 or greater. * @throws IllegalArgumentException if {@code} str does not contain a valid representation. */ private static int kmerSizeFromString(final String str) { final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str); if (matcher.find()) { return Integer.parseInt(matcher.group(2)); } else { throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize"); } } /** * Apply description string into the graph. * *

* Note: this is done just for testing purposes. * Checkout {@see HaplotypeGraphUnitTest} for examples. *

* @param str the string representation. */ private void applyString(final String str) { final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str); final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0; final String pathString = str.substring(pathStart); final Matcher pathMatcher = PATH_PATTERN.matcher(pathString); boolean referenceFound = false; final Map vertexById = new HashMap<>(); // Loop between path strings and add them one by one. while (pathMatcher.find()) { final String label = pathMatcher.group(2); final boolean isReference = "REF".equals(label); if (referenceFound) { Utils.validateArg(!isReference, "there are two reference paths"); } else if ( isReference ) { referenceFound = true; } // Divide each path into its elements getting a list of sequences and labels if applies: final String elementsString = pathMatcher.group(3); final String[] elements = elementsString.split("\\s*->\\s*"); Utils.validateArg(elements.length > 0, "empty path not allowed"); final String[] seqs = new String[elements.length]; final String[] ids = new String[elements.length]; for (int i = 0; i < elements.length; i++) { ids[i] = pathElementId(elements[i]); seqs[i] = pathElementSeq(elements[i]); Utils.validateArg(!(seqs[i].isEmpty() && ids[i] == null), "path with empty element without an id"); } final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]); if (isSource && seqs[0].length() != kmerSize) { throw new IllegalArgumentException("source sequence length must be the same as the kmerSize " + ids[0] + ' ' + seqs[0] + ' ' + pathMatcher.group()); } final MultiDeBruijnVertex firstVertex; if (ids[0] != null && vertexById.containsKey(ids[0])) { firstVertex = vertexById.get(ids[0]); } else { firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes()); addVertex(firstVertex); if (ids[0] != null) { vertexById.put(ids[0], firstVertex); } } if (!seqs[0].isEmpty() && ((isSource && 
!firstVertex.getSequenceString().equals(seqs[0])) || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) { throw new IllegalArgumentException("mismatched first element sequence"); } MultiDeBruijnVertex lastVertex = firstVertex; for (int i = 1; i < elements.length; i++) { //TODO: code and comment disagree Utils.validateArg(seqs[i].length() <= 1, "non-source vertex sequence must have length 1"); final MultiDeBruijnVertex nextVertex; if (ids[i] == null || !vertexById.containsKey(ids[i])) { final Set nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]); if (nextVertices.isEmpty()) { nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0])); addVertex(nextVertex); } else { nextVertex = nextVertices.iterator().next(); } if (ids[i] != null) { vertexById.put(ids[i], nextVertex); } } else { nextVertex = vertexById.get(ids[i]); } final MultiSampleEdge edge = addEdge(lastVertex,nextVertex); if (isReference) { edge.setIsRef(true); } lastVertex = nextVertex; } } } /** * Return the collection of outgoing vertices that expand this vertex with a particular base. * * @param v original vertex. * @param b expanding base. * @return never null, but perhaps an empty set. You cannot assume that you can modify the result. 
*/ @VisibleForTesting Set getNextVertices(final MultiDeBruijnVertex v, final byte b) { Utils.nonNull(v, "the input vertex cannot be null"); Utils.validateArg(vertexSet().contains(v), "the vertex must be present in the graph"); final List result = new LinkedList<>(); for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) { if (w.getSuffix() == b) { result.add(w); } } switch (result.size()) { case 0: return Collections.emptySet(); case 1: return Collections.singleton(result.get(0)); default: return new HashSet<>(result); } } private static String pathElementId(final String element) { final int openBracketPosition = element.indexOf('('); if (openBracketPosition == -1) { return null; } final int closeBracketPosition = element.lastIndexOf(')'); Utils.validateArg(closeBracketPosition != -1, () -> "non-closed id parantesys found in element: " + element); final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim(); Utils.validateArg(!result.isEmpty(), () -> "empty id found in element: " + element); return result; } /** * Returns the lenght of a path element in the string representation. * @param element the query element. * @return 0 or greater. */ private static String pathElementSeq(final String element) { final int parentesysPos = element.indexOf('('); if (parentesysPos == -1) { return element.trim(); } return element.substring(0,parentesysPos).trim(); } /** * Add a base to the end of a byte sequence. * @param sequence sequence where to add the base to. * @param b base to add. * @return never {@code null}, a new array each time. */ private static byte[] extendSequence(final byte[] sequence, final byte b) { final byte[] result = new byte[sequence.length]; System.arraycopy(sequence, 1, result, 0, sequence.length - 1); result[result.length - 1] = b; return result; } @Override public TestingReadThreadingGraph clone() { return (TestingReadThreadingGraph) super.clone(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy