All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.LocalAssembler Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.PairWalker;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.collections.HopscotchSet;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPOutputStream;

@DocumentedFeature
@BetaFeature
@CommandLineProgramProperties(
        summary = "Performs local assembly of small regions to discover structural variants.",
        oneLineSummary = "Local assembler for SVs",
        usageExample = "gatk LocalAssembler -L chr21:16187360-16187360 --ip 500 -R 38.fa.gz " +
                "-I NA19240.cram -I NA19240.distantmate.bam " +
                "--assembly-name chr21_16187360_16187360_INS --gfa-file test.gfa --fasta-file test.fa.gz",
        programGroup = CoverageAnalysisProgramGroup.class
)
public class LocalAssembler extends PairWalker {
    @Argument(fullName="assembly-name", doc="Name of assembly used as a prefix for traversal names.")
    public String assemblyName;

    @Argument(fullName="gfa-file", doc="Path to assembly output in gfa format.", optional=true)
    public GATKPath gfaFile;

    @Argument(fullName="fasta-file", doc="Path to scaffolds in fasta format.", optional=true)
    public GATKPath fastaFile;

    public static final byte QMIN_DEFAULT = 25;
    @Argument(fullName="q-min", doc="Minimum base quality when kmerizing reads.", optional=true)
    private byte qMin = QMIN_DEFAULT;

    public static final int MIN_THIN_OBS_DEFAULT = 4;
    @Argument(fullName="min-thin-observations",
            doc="Minimum number of observations of some kmer within the contig required to " +
                    "retain the contig.", optional=true)
    private int minThinObs = MIN_THIN_OBS_DEFAULT;

    public static final int MIN_GAPFILL_COUNT_DEFAULT = 3;
    @Argument(fullName="min-gapfill-count",
            doc="Minimum number of observations of a sequence that patches a gap.", optional=true)
    private int minGapfillCount = MIN_GAPFILL_COUNT_DEFAULT;

    public static final int TOO_MANY_TRAVERSALS_DEFAULT = 100000;
    @Argument(fullName="too-many-traversals",
            doc="If the assembly graph produces this many traversals, just emit contigs instead.",
            optional=true)
    private int tooManyTraversals = TOO_MANY_TRAVERSALS_DEFAULT;

    public static final int TOO_MANY_SCAFFOLDS_DEFAULT = 50000;
    @Argument(fullName="too-many-scaffolds",
            doc="If the assembly graph produces this many scaffolds, just emit traversals instead.",
            optional=true)
    private int tooManyScaffolds = TOO_MANY_SCAFFOLDS_DEFAULT;

    public static final int MIN_SV_SIZE_DEFAULT = 50;
    @Argument(fullName="min-sv-size",
            doc="Smallest variation size to count as a structural variant.", optional=true)
    public int minSVSize = MIN_SV_SIZE_DEFAULT;

    @Argument(fullName="no-scaffolding", doc="turn off scaffolding -- write traversals instead", optional=true)
    private boolean noScaffolding = false;

    private final List reads = new ArrayList<>();

    @Override public boolean requiresIntervals() { return true; }

    @Override public void apply( final GATKRead read, final GATKRead mate ) {
        trimOverruns(read, mate);
        reads.add(read);
        reads.add(mate);
    }

    @Override public void applyUnpaired( final GATKRead read ) {
        reads.add(read);
    }

    @Override public Object onTraversalSuccess() {
        super.onTraversalSuccess(); // flush any incomplete pairs

        if ( gfaFile == null ) {
            gfaFile = new GATKPath(assemblyName + ".gfa.gz");
        }
        if ( fastaFile == null ) {
            fastaFile = new GATKPath(assemblyName + ".fa.gz");
        }

        final int regionSize = getTraversalIntervals().stream().mapToInt(SimpleInterval::size).sum();
        final KmerSet kmerAdjacencySet = new KmerSet<>(10 * regionSize);
        kmerizeReads(reads, qMin, kmerAdjacencySet);

        List contigs = createAssembly(kmerAdjacencySet, minThinObs);
        if ( fillGaps(kmerAdjacencySet, minGapfillCount, reads) ) {
            contigs = createAssembly(kmerAdjacencySet, minThinObs);
        }

        markCycles(contigs);

        final List readPaths = pathReads(kmerAdjacencySet, reads);
        final Map> contigTransitsMap =
                collectTransitPairCounts(contigs, readPaths);
        try {
            final List allTraversals = new ArrayList<>(
                    traverseAllPaths(contigs, readPaths, tooManyTraversals, contigTransitsMap));
            contigs.sort(Comparator.comparingInt(ContigImpl::getId));
            writeGFA(gfaFile, contigs, allTraversals);
            if ( noScaffolding ) {
                writeTraversals(fastaFile, assemblyName, allTraversals);
                return null;
            }
            try {
                writeTraversals(fastaFile, assemblyName,
                        createScaffolds(allTraversals, tooManyScaffolds, minSVSize));
            } catch ( final AssemblyTooComplexException x ) {
                logger.warn("Assembly too complex for scaffolding. Writing traversals to fasta-file");
                writeTraversals(fastaFile, assemblyName, allTraversals);
            }
        } catch ( final AssemblyTooComplexException x ) {
            logger.warn("Assembly too complex to traverse.  Writing contigs as traversals to fasta-file");
            final Collection contigTraversals = new ArrayList<>(contigs.size());
            for ( final Contig contig : contigs ) {
                contigTraversals.add(new Traversal(Collections.singletonList(contig)));
            }
            writeTraversals(fastaFile, assemblyName, contigTraversals);
        }
        return null;
    }

    private static List createAssembly( final KmerSet kmerAdjacencySet,
                                                    final int minThinObs ) {
        final List contigs = buildContigs(kmerAdjacencySet);
        connectContigs(contigs);
        removeThinContigs(contigs, minThinObs, kmerAdjacencySet);
        weldPipes(contigs);
        return contigs;
    }

    /** trim read pairs of base calls that have gone past the end of a short fragment */
    @VisibleForTesting
    static void trimOverruns( final GATKRead read, final GATKRead mate ) {
        // if both mapped and they're on different strands
        if ( !read.isUnmapped() && !mate.isUnmapped() &&
                read.isReverseStrand() != mate.isReverseStrand() ) {
            // and both start within 1 base on the ref
            if ( Math.abs(read.getStart() - read.getMateStart()) <= 1 ) {
                // and both end within 1 base
                final int readRefLen = read.getCigar().getReferenceLength();
                final int mateRefLen = mate.getCigar().getReferenceLength();
                if ( Math.abs(readRefLen - mateRefLen) <= 1 ) {
                    if ( mate.isReverseStrand() ) {
                        trimClips(read, mate);
                    } else {
                        trimClips(mate, read);
                    }
                }
            }
        }
    }

    private static void trimClips( final GATKRead fwd, final GATKRead rev ) {
        final List fwdElements = fwd.getCigarElements();
        final List revElements = rev.getCigarElements();
        final int lastFwdElementIdx = fwdElements.size() - 1;
        final int lastRevElementIdx = revElements.size() - 1;
        final CigarElement fwdLastElement = fwdElements.get(lastFwdElementIdx);
        final CigarElement revLastElement = revElements.get(lastRevElementIdx);
        final CigarElement fwdFirstElement = fwdElements.get(0);
        final CigarElement revFirstElement = revElements.get(0);
        if ( fwdFirstElement.getOperator() == CigarOperator.M &&
                fwdLastElement.getOperator() == CigarOperator.S &&
                revFirstElement.getOperator() == CigarOperator.S &&
                revLastElement.getOperator() == CigarOperator.M ) {
            final byte[] fwdBases = fwd.getBasesNoCopy();
            final int lastElementLen = fwdLastElement.getLength();
            fwd.setBases(Arrays.copyOfRange(fwdBases, 0, fwdBases.length - lastElementLen));
            final byte[] fwdQuals = fwd.getBaseQualitiesNoCopy();
            if ( fwdQuals.length > 0 ) {
                final int qualsLen = fwdQuals.length - lastElementLen;
                fwd.setBaseQualities(Arrays.copyOfRange(fwdQuals, 0, qualsLen));
            }
            final List newFwdElements = new ArrayList<>(fwdElements);
            newFwdElements.set(lastFwdElementIdx, new CigarElement(lastElementLen, CigarOperator.H));
            fwd.setCigar(new Cigar(newFwdElements));

            final byte[] revBases = rev.getBasesNoCopy();
            final int firstElementLen = revFirstElement.getLength();
            rev.setBases(Arrays.copyOfRange(revBases, firstElementLen, revBases.length));
            final byte[] revQuals = rev.getBaseQualitiesNoCopy();
            if ( revQuals.length > 0 ) {
                rev.setBaseQualities(Arrays.copyOfRange(revQuals, firstElementLen, revQuals.length));
            }
            final List newRevElements = new ArrayList<>(revElements);
            newRevElements.set(0, new CigarElement(firstElementLen, CigarOperator.H));
            rev.setCigar(new Cigar(newRevElements));
        }
    }

    @VisibleForTesting
    static void kmerizeReads( final List reads,
                              final byte qMin,
                              final KmerSet kmerAdjacencySet ) {
        for ( final GATKRead read : reads ) {
            final byte[] calls = read.getBasesNoCopy();
            final byte[] quals = read.getBaseQualitiesNoCopy();
            KmerAdjacency.kmerize(calls, quals, qMin, kmerAdjacencySet);
        }
    }

    /** gather unbranched strings of kmers into contigs */
    @VisibleForTesting
    static List buildContigs( final KmerSet kmerAdjacencySet ) {
        // This method identifies each KmerAdjacency that is a contig start or end, and then builds
        // a contig from that start or end. That's actually a lie: it builds contigs at the starts,
        // and builds contigs from the reverse complement of the ends (because that's the start of a
        // contig on the other strand) to economize on code.
        // A KmerAdjacency is a contig start if:
        // 1) it has more than 1 predecessor, or no predecessors
        // 2) it has a single predecessor, but that predecessor has multiple successors
        // 3) its predecessor is its own reverse complement (i.e., a palindromic hairpin)
        // Similarly, a KmerAdjacency is a contig end if:
        // 1) it has more than 1 successor, or no successors
        // 2) it has a single successor, but that successor has multiple predecessors
        // 3) its successor is its own reverse complement
        final List contigs = new ArrayList<>();
        int nContigs = 0;
        for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) {
            if ( kmerAdjacency.getContig() == null ) {
                ContigImpl contig = null;
                final KmerAdjacency predecessor = kmerAdjacency.getSolePredecessor();
                // if we should start a contig with this kmer
                if ( predecessor == null || // yes, no predecessors or more than 1
                        predecessor.getSuccessorCount() > 1 || // yes, predecessor has multiple successors
                        predecessor == kmerAdjacency.rc() ) { // yes, predecessor folds back as a palindrome
                    contig = new ContigImpl(++nContigs, kmerAdjacency);
                } else {
                    // if we should end a contig with this kmer (actually we'll start a contig with the RC)
                    final KmerAdjacency successor = kmerAdjacency.getSoleSuccessor();
                    if ( successor == null || // yes, no successors or more than 1
                            successor.getPredecessorCount() > 1 || // yes, successor has multiple predecessors
                            successor == kmerAdjacency.rc() ) { // yes, successor folds back as a palindrome
                        contig = new ContigImpl(++nContigs, kmerAdjacency.rc());
                    }
                }
                if ( contig != null ) {
                    setKmerContig(contig);
                    contigs.add(contig);
                }
            }
        }

        // if there are smooth circles like a plasmid, gather them together as a contig, too
        for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) {
            if ( kmerAdjacency.getContig() == null ) {
                final ContigImpl contig = new ContigImpl(++nContigs, kmerAdjacency);
                setKmerContig(contig);
                contigs.add(contig);
            }
        }

        return contigs;
    }

    /** connect contigs when the final kmer of one contig is adjacent to the inital contig of another */
    @VisibleForTesting
    static void connectContigs( final List contigs ) {
        final int nContigs = contigs.size();
        final KmerSet contigEnds = new KmerSet<>(2*nContigs);
        for ( int contigId = 0; contigId != nContigs; ++contigId ) {
            final ContigImpl contig = contigs.get(contigId);
            final KmerAdjacency fwdKmer = contig.getFirstKmer();
            final KmerAdjacency revKmer = contig.getLastKmer().rc();
            if ( fwdKmer == revKmer ) {
                contigEnds.add(new ContigEndKmer(fwdKmer.getKVal(), contig, ContigOrientation.BOTH));
            } else {
                contigEnds.add(new ContigEndKmer(fwdKmer.getKVal(), contig, ContigOrientation.FWD));
                contigEnds.add(new ContigEndKmer(revKmer.getKVal(), contig, ContigOrientation.REV));
            }
        }

        for ( int contigId = 0; contigId != nContigs; ++contigId ) {
            final Contig contig = contigs.get(contigId);

            final KmerAdjacency start = contig.getFirstKmer();
            final int predecessorCount = start.getPredecessorCount();
            if ( predecessorCount > 0 ) {
                final List predecessors = contig.getPredecessors();
                final int mask = start.getPredecessorMask();
                for ( int call = 0; call != 4; ++call ) {
                    if ( (mask & (1 << call)) != 0 ) {
                        final long kVal =
                                KmerAdjacency.reverseComplement(start.getPredecessorVal(call));
                        final ContigEndKmer contigEndKmer = contigEnds.find(new Kmer(kVal));
                        if ( contigEndKmer == null ) {
                            throw new GATKException("missing contig end kmer");
                        }
                        switch ( contigEndKmer.getContigOrientation() ) {
                            case FWD:
                                predecessors.add(contigEndKmer.getContig().rc());
                                break;
                            case REV:
                                predecessors.add(contigEndKmer.getContig());
                                break;
                            case BOTH:
                                predecessors.add(contigEndKmer.getContig());
                                predecessors.add(contigEndKmer.getContig().rc());
                                break;
                        }
                    }
                }
            }

            final KmerAdjacency end = contig.getLastKmer();
            final int successorCount = end.getSuccessorCount();
            if ( successorCount > 0 ) {
                final List successors = contig.getSuccessors();
                final int mask = end.getSuccessorMask();
                for ( int call = 0; call != 4; ++call ) {
                    if ( (mask & (1 << call)) != 0 ) {
                        final long kVal = end.getSuccessorVal(call);
                        final ContigEndKmer contigEndKmer = contigEnds.find(new Kmer(kVal));
                        if ( contigEndKmer == null ) {
                            throw new GATKException("missing contig end kmer");
                        }
                        switch ( contigEndKmer.getContigOrientation() ) {
                            case FWD:
                                successors.add(contigEndKmer.getContig());
                                break;
                            case REV:
                                successors.add(contigEndKmer.getContig().rc());
                                break;
                            case BOTH:
                                successors.add(contigEndKmer.getContig());
                                successors.add(contigEndKmer.getContig().rc());
                                break;
                        }
                    }
                }
            }
        }
    }

    /** remove contigs that have little evidence */
    @VisibleForTesting
    static void removeThinContigs( final List contigs,
                                   final int minThinObs,
                                   final KmerSet kmerAdjacencySet ) {
        contigs.sort(Comparator.comparingInt(ContigImpl::getMaxObservations));
        boolean contigRemoved;
        final WalkDataFactory walkDataFactory = new WalkDataFactory();
        do {
            // figure out which contigs are cut points
            // i.e., those contigs which, if removed, would result in a graph with more connected components
            final int nContigs = contigs.size();
            final Map cutDataMap = new HashMap<>(nContigs * 3);

            for ( final ContigImpl contig : contigs ) {
                if ( cutDataMap.containsKey(contig) ) {
                    continue;
                }

                cutDataMap.put(contig, walkDataFactory.create());
                int children = 0;
                for ( final Contig nextContig : contig.getSuccessors() ) {
                    if ( !cutDataMap.containsKey(nextContig) ) {
                        findCuts(nextContig, contig, walkDataFactory, cutDataMap);
                        children += 1;
                    }
                }
                for ( final Contig nextContig : contig.getPredecessors() ) {
                    if ( !cutDataMap.containsKey(nextContig) ) {
                        findCuts(nextContig, contig, walkDataFactory, cutDataMap);
                        children += 1;
                    }
                }
                if ( children >= 2 ) {
                    contig.setMarked(true);
                }
            }

            // remove poorly attested (low max observations) contigs, unless they are cut points
            contigRemoved = false;
            final Iterator itr = contigs.iterator();
            while ( itr.hasNext() ) {
                final Contig contig = itr.next();
                // TODO: Think about replacing the heuristic "minThinObs" with something that
                //       takes the observation depth of adjacent contigs into account.
                if ( contig.getMaxObservations() < minThinObs && !contig.isMarked() ) {
                    unlinkContig(contig, kmerAdjacencySet);
                    itr.remove();
                    contigRemoved = true;
                    break;
                }
            }
        } while ( contigRemoved );
        contigs.sort(Comparator.comparingInt(ContigImpl::getId));
    }

    private static WalkData findCuts( final Contig contig,
                                      final Contig parent,
                                      final WalkDataFactory walkDataFactory,
                                      final Map cutDataMap ) {
        final WalkData walkData = walkDataFactory.create();
        cutDataMap.put(contig, walkData);
        for ( final Contig nextContig : contig.getSuccessors() ) {
            if ( nextContig == parent ) {
                continue;
            }
            WalkData nextWalkData = cutDataMap.get(nextContig);
            if ( nextWalkData != null ) {
                walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.visitNum);
            } else {
                nextWalkData = findCuts(nextContig, contig, walkDataFactory, cutDataMap);
                walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.minVisitNum);
                if ( nextWalkData.minVisitNum >= walkData.visitNum ) {
                    contig.setMarked(true);
                }
            }
        }
        for ( final Contig nextContig : contig.getPredecessors() ) {
            if ( nextContig == parent ) {
                continue;
            }
            WalkData nextWalkData = cutDataMap.get(nextContig);
            if ( nextWalkData != null ) {
                walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.visitNum);
            } else {
                nextWalkData = findCuts(nextContig, contig, walkDataFactory, cutDataMap);
                walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.minVisitNum);
                if ( nextWalkData.minVisitNum >= walkData.visitNum ) {
                    contig.setMarked(true);
                }
            }
        }
        return walkData;
    }

    @VisibleForTesting
    static void unlinkContig( final Contig contig, final KmerSet kmerAdjacencySet ) {
        final KmerAdjacency firstKmer = contig.getFirstKmer();
        final int firstKmerFinalCall = firstKmer.getFinalCall();
        for ( final Contig predecessor : contig.getPredecessors() ) {
            if ( predecessor != contig && predecessor != contig.rc() ) {
                predecessor.getLastKmer().removeSuccessor(firstKmerFinalCall, kmerAdjacencySet);
                if ( !predecessor.getSuccessors().remove(contig) ) {
                    throw new GATKException("failed to find predecessor link");
                }
            }
        }

        final KmerAdjacency lastKmer = contig.getLastKmer();
        final int lastKmerInitialCall = lastKmer.getInitialCall();
        for ( final Contig successor : contig.getSuccessors() ) {
            if ( successor != contig && successor != contig.rc() ) {
                successor.getFirstKmer().removePredecessor(lastKmerInitialCall, kmerAdjacencySet);
                if ( !successor.getPredecessors().remove(contig) ) {
                    throw new GATKException("failed to find successor link");
                }
            }
        }

        KmerAdjacency nextKmer = firstKmer;
        KmerAdjacency kmer;
        do {
            kmer = nextKmer;
            nextKmer = kmer.getSoleSuccessor();
            kmerAdjacencySet.remove(kmer.canonical());
        } while ( kmer != lastKmer );
    }

    private static void clearKmerContig( final Contig contig ) {
        int count = 0;
        final KmerAdjacency firstKmer = contig.getFirstKmer();
        final KmerAdjacency lastKmer = contig.getLastKmer();
        for ( KmerAdjacency kmer = firstKmer; kmer != lastKmer; kmer = kmer.getSoleSuccessor() ) {
            if ( kmer == null ) {
                throw new GATKException("contig does not have a flat pipeline of kmers");
            }
            if ( kmer.getContig() == null ) {
                throw new GATKException("we've returned to a kmer we've already cleared");
            }
            kmer.clearContig();
            count += 1;
        }
        lastKmer.clearContig();
        if ( count + Kmer.KSIZE != contig.size() ) {
            throw new GATKException("kmer chain length does not equal contig size");
        }
    }

    /** Sets a pointer back to its unique containing contig for each kmer comprised by the contig */
    private static void setKmerContig( final Contig contig ) {
        int offset = 0;
        final KmerAdjacency firstKmer = contig.getFirstKmer();
        final KmerAdjacency lastKmer = contig.getLastKmer();
        for ( KmerAdjacency kmer = firstKmer; kmer != lastKmer; kmer = kmer.getSoleSuccessor() ) {
            if ( kmer == null ) {
                throw new GATKException("contig does not have a flat pipeline of kmers");
            }
            if ( kmer.getContig() != null ) {
                throw new GATKException("we've returned to a kmer we've already updated");
            }
            kmer.setContigOffset(contig, offset++);
        }
        lastKmer.setContigOffset(contig, offset);
        if ( offset + Kmer.KSIZE != contig.size() ) {
            throw new GATKException("kmer chain length does not equal contig size");
        }
    }

    /** replace adjacent contigs without branches with a single, larger contig */
    @VisibleForTesting
    static void weldPipes( final List contigs ) {
        for ( int contigIdx = 0; contigIdx != contigs.size(); ++contigIdx ) {
            final ContigImpl contig = contigs.get(contigIdx);
            if ( contig.getSuccessors().size() == 1 ) {
                final Contig successor = contig.getSuccessors().get(0);
                if ( successor != contig && successor != contig.rc() &&
                        successor.getPredecessors().size() == 1 ) {
                    contigs.set(contigIdx, join(contig.getId(), contig, successor));
                    if ( !contigs.remove(successor.canonical()) ) {
                        throw new GATKException("successor linkage is messed up");
                    }
                    contigIdx -= 1; // reconsider the new contig -- there might be more joining possible
                    continue;
                }
            }
            if ( contig.getPredecessors().size() == 1 ) {
                final Contig predecessor = contig.getPredecessors().get(0);
                if ( predecessor != contig && predecessor != contig.rc() &&
                        predecessor.getSuccessors().size() == 1 ) {
                    contigs.set(contigIdx, join(contig.getId(), predecessor, contig));
                    if ( !contigs.remove(predecessor.canonical()) ) {
                        throw new GATKException("predecessor linkage is messed up");
                    }
                    contigIdx -= 1; // reconsider
                }
            }
        }
    }

    private static ContigImpl join( final int id,
                                    final Contig predecessor,
                                    final Contig successor ) {
        final ContigImpl joinedContig = new ContigImpl(id, predecessor, successor);
        clearKmerContig(joinedContig);
        setKmerContig(joinedContig);
        return joinedContig;
    }

    @VisibleForTesting
    static void markCycles( final List contigs ) {
        for ( final Contig contig : contigs ) {
            contig.setIsCycleMember(false);
        }

        final int nContigs = contigs.size();
        final Deque deque = new ArrayDeque<>(nContigs);
        final Map cutDataMap = new HashMap<>(nContigs * 3);
        final WalkDataFactory walkDataFactory = new WalkDataFactory();
        for ( final Contig contig : contigs ) {
            if ( !cutDataMap.containsKey(contig) ) {
                markCyclesRecursion(contig, deque, walkDataFactory, cutDataMap);
            }
        }
    }

    private static WalkData markCyclesRecursion( final Contig contig,
                                                 final Deque deque,
                                                 final WalkDataFactory walkDataFactory,
                                                 final Map cutDataMap ) {
        final WalkData walkData = walkDataFactory.create();
        cutDataMap.put(contig, walkData);
        deque.addFirst(contig);

        for ( final Contig successor : contig.getSuccessors() ) {
            final WalkData successorWalkData = cutDataMap.get(successor);
            if ( successorWalkData == null ) {
                final int recursionVisitNum =
                        markCyclesRecursion(successor, deque, walkDataFactory, cutDataMap).minVisitNum;
                walkData.minVisitNum = Math.min(walkData.minVisitNum, recursionVisitNum);
            } else {
                walkData.minVisitNum = Math.min(walkData.minVisitNum, successorWalkData.visitNum);
            }
        }

        if ( walkData.visitNum == walkData.minVisitNum ) {
            Contig tig = deque.removeFirst();
            if ( tig == contig ) {
                cutDataMap.get(tig).visitNum = Integer.MAX_VALUE;

                // single-vertex component -- cyclic only if self-referential
                if ( tig.getSuccessors().contains(tig) ) {
                    tig.setIsCycleMember(true);
                }
            } else {
                while ( true ) {
                    // kill cross-links
                    cutDataMap.get(tig).visitNum = Integer.MAX_VALUE;
                    tig.setIsCycleMember(true);
                    if ( tig == contig ) break;
                    tig = deque.removeFirst();
                }
            }
        }
        return walkData;
    }

    @VisibleForTesting
    static boolean fillGaps( final KmerSet kmerAdjacencySet,
                             final int minGapfillCount,
                             final List reads ) {
        final Map gapFillCounts = new HashMap<>();
        final PathBuilder pathBuilder = new PathBuilder(kmerAdjacencySet);
        for ( final GATKRead read : reads ) {
            final Path path = new Path(read.getBasesNoCopy(), pathBuilder);
            final List parts = path.getParts();
            final int lastIdx = parts.size() - 1;
            for ( int idx = 1; idx < lastIdx; ++idx ) {
                final PathPart pathPart = parts.get(idx);
                if ( pathPart.isGap() ) {
                    final char prevCall = parts.get(idx - 1).getLastCall();
                    final char nextCall = parts.get(idx + 1).getFirstCall();
                    String gapFill = prevCall + pathPart.getSequence().toString() + nextCall;
                    final SequenceRC gapFillRC = new SequenceRC(gapFill);
                    if ( gapFillRC.compareTo(gapFill) < 0 ) {
                        gapFill = gapFillRC.toString();
                    }
                    gapFillCounts.merge(gapFill, 1, Integer::sum);
                }
            }
        }

        boolean newKmers = false;
        for ( final Map.Entry entry : gapFillCounts.entrySet() ) {
            final int nObservations = entry.getValue();
            if ( nObservations >= minGapfillCount ) {
                KmerAdjacency.kmerize(entry.getKey(), nObservations, kmerAdjacencySet);
                newKmers = true;
            }
        }

        if ( newKmers ) {
            for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) {
                kmerAdjacency.clearContig();
            }
        }
        return newKmers;
    }

    @VisibleForTesting
    static List pathReads( final KmerSet kmerAdjacencySet,
                          final List reads ) {
        final List readPaths = new ArrayList<>(reads.size());
        final PathBuilder pathBuilder = new PathBuilder(kmerAdjacencySet);
        for ( final GATKRead read : reads ) {
            readPaths.add(new Path(read.getBasesNoCopy(), pathBuilder));
        }
        return readPaths;
    }

    @VisibleForTesting
    static Map> collectTransitPairCounts(
            final List contigs,
            final List readPaths ) {
        final Map> contigTransitsMap =
                new HashMap<>(3 * contigs.size());
        for ( final Path path : readPaths ) {
            final List parts = path.getParts();
            final int lastPart = parts.size() - 1;
            for ( int partIdx = 1; partIdx < lastPart; ++partIdx ) {
                final Contig prevContig = parts.get(partIdx - 1).getContig();
                if ( prevContig == null ) continue;
                final Contig curContig = parts.get(partIdx).getContig();
                if ( curContig == null ) {
                    partIdx += 1;
                    continue;
                }
                final Contig nextContig = parts.get(partIdx + 1).getContig();
                if ( nextContig == null ) {
                    partIdx += 2;
                    continue;
                }
                final TransitPairCount tpc = new TransitPairCount(prevContig, nextContig);
                final List tpcList =
                        contigTransitsMap.computeIfAbsent(curContig, tig -> new ArrayList<>(4));
                final int idx = tpcList.indexOf(tpc);
                if ( idx != -1 ) {
                    tpcList.get(idx).observe();
                } else {
                    tpcList.add(tpc);
                    contigTransitsMap.computeIfAbsent(curContig.rc(), tig -> new ArrayList<>(4))
                            .add(tpc.getRC());
                }
            }
        }
        return contigTransitsMap;
    }

    @VisibleForTesting
    static Set traverseAllPaths(
            final List contigs,
            final List readPaths,
            final int tooManyTraversals,
            final Map> contigTransitsMap ) {
        final TraversalSet traversalSet = new TraversalSet(tooManyTraversals);
        final List contigsList = new ArrayList<>();
        // build traversals from untransited contigs
        // untransited contigs are sources, sinks, or large contigs that can't be crossed by a read
        for ( final Contig contig : contigs ) {
            if ( !contigTransitsMap.containsKey(contig) ) {
                if ( contig.getSuccessors().isEmpty() && contig.getPredecessors().isEmpty() ) {
                    traversalSet.add(new Traversal(Collections.singletonList(contig)));
                } else {
                    for ( final Contig successor : contig.getSuccessors() ) {
                        traverse(successor, contig, contigsList,
                                    readPaths, contigTransitsMap, traversalSet);
                    }
                    for ( final Contig predecessor : contig.getPredecessors() ) {
                        traverse(predecessor.rc(), contig.rc(), contigsList,
                                    readPaths, contigTransitsMap, traversalSet);
                    }
                }
            }
        }

        // build traversals across acyclic contigs with transits that haven't been used yet
        for ( final Map.Entry> entry :
                contigTransitsMap.entrySet() ) {
            final Contig contig = entry.getKey();
            if ( contig.isCycleMember() ) {
                continue;
            }
            for ( final TransitPairCount tpc : entry.getValue() ) {
                if ( tpc.getCount() > 0 ) {
                    tpc.resetCount();
                    final TraversalSet fwdTraversalSet = new TraversalSet(tooManyTraversals);
                    traverse(tpc.getNextContig(), contig, contigsList,
                            readPaths, contigTransitsMap, fwdTraversalSet);
                    final TraversalSet revTraversalSet = new TraversalSet(tooManyTraversals);
                    traverse(tpc.getPrevContig().rc(), contig.rc(), contigsList,
                            readPaths, contigTransitsMap, revTraversalSet);
                    for ( final Traversal revTraversal : revTraversalSet ) {
                        final Traversal revTraversalRC = revTraversal.rc();
                        for ( final Traversal fwdTraversal : fwdTraversalSet ) {
                            traversalSet.add(Traversal.combine(revTraversalRC, fwdTraversal));
                        }
                    }
                }
            }
        }

        // build traversals from any remaining unexplored transits
        for ( final Map.Entry> entry :
                contigTransitsMap.entrySet() ) {
            final Contig contig = entry.getKey();
            for ( final TransitPairCount tpc : entry.getValue() ) {
                if ( tpc.getCount() > 0 ) {
                    tpc.resetCount();
                    final int lastElement = contigsList.size();
                    contigsList.add(tpc.getPrevContig());
                    traverse(tpc.getNextContig(), contig, contigsList,
                            readPaths, contigTransitsMap, traversalSet);
                    contigsList.set(lastElement, tpc.getNextContig().rc());
                    traverse(tpc.getPrevContig().rc(), contig.rc(), contigsList,
                            readPaths, contigTransitsMap, traversalSet);
                    contigsList.remove(lastElement);
                }
            }
        }

        return traversalSet;
    }

    private static void traverse( final Contig contig,
                                  final Contig predecessor,
                                  final List contigsList,
                                  final List readPaths,
                                  final Map> contigTransitsMap,
                                  final TraversalSet traversalSet ) {
        contigsList.add(predecessor); // extend the contigsList
        if ( contig.isCycleMember() ) {
            traverseCycle(contig, contigsList, readPaths, contigTransitsMap, traversalSet);
            // about to return, remove element added 3 lines above
            contigsList.remove(contigsList.size() - 1);
            return;
        }
        final List transits = contigTransitsMap.get(contig);
        boolean foundTransitInMap = false;
        if ( transits != null ) {
            for ( final TransitPairCount tpc : transits ) {
                if ( tpc.getPrevContig() == predecessor ) {
                    final Contig successor = tpc.getNextContig();
                    if ( predecessor == contig.rc() ) { // if palindromic
                        final int nContigs = contigsList.size();
                        if ( nContigs > 1 ) {
                            if ( successor.rc() == contigsList.get(nContigs - 2) ) {
                                continue;
                            }
                        }
                    }
                    tpc.resetCount();
                    traverse(successor, contig, contigsList,
                                readPaths, contigTransitsMap, traversalSet);
                    foundTransitInMap = true;
                }
            }
        }
        if ( !foundTransitInMap ) {
            contigsList.add(contig);
            traversalSet.add(new Traversal(contigsList));
            contigsList.remove(contigsList.size() - 1);
        }

        // undo what we did on the 1st line of this method
        contigsList.remove(contigsList.size() - 1);
    }

    private static void traverseCycle( final Contig contig,
                                       final List contigsList,
                                       final List readPaths,
                                       final Map> contigTransitsMap,
                                       final TraversalSet traversalSet ) {
        contigsList.add(contig);
        final int nContigs = contigsList.size();

        // We're here because the final element of the list is cyclic.
        // The previous element, if it exists, is a good place from which to start figuring out how
        // far the read paths lead us.
        if ( !contig.isCycleMember() ) {
            throw new GATKException("not called from a cyclic contig");
        }
        final List contigsSubList =
                nContigs <= 2 ? contigsList : contigsList.subList(nContigs - 2, nContigs);
        final List> longestPaths = findLongestPaths(contigsSubList, readPaths);
        // didn't get anywhere -- just complete the traversal
        if ( longestPaths.isEmpty() ) {
            traversalSet.add(new Traversal(contigsList));
        } else {
            // for each unique extension into the cycle
            for ( final List path : longestPaths ) {
                // don't think this can happen, but still
                if ( path.isEmpty() ) {
                    traversalSet.add(new Traversal(contigsList));
                    continue;
                }
                final List extendedContigsList =
                        new ArrayList<>(contigsList.size() + path.size());
                extendedContigsList.addAll(contigsList);
                // if we didn't get out of the cycle
                if ( path.get(path.size() - 1).isCycleMember() ) {
                    extendedContigsList.addAll(path);
                    traversalSet.add(new Traversal(extendedContigsList));
                } else {
                    // we found a cycle-exiting path, so extend that normally
                    for ( final Contig curContig : path ) {
                        if ( curContig.isCycleMember() ) {
                            extendedContigsList.add(curContig);
                        } else {
                            final Contig prevContig =
                                    extendedContigsList.remove(extendedContigsList.size() - 1);
                            traverse(curContig, prevContig, extendedContigsList, readPaths,
                                        contigTransitsMap, traversalSet);
                            extendedContigsList.add(prevContig);
                            break;
                        }
                    }
                }
                clearTransitPairs(contigTransitsMap, extendedContigsList);
            }
        }
        contigsList.remove(contigsList.size() - 1);
    }

    private static void clearTransitPairs(
            final Map> contigTransitsMap,
            final List contigsList ) {
        final int lastIdx = contigsList.size() - 1;
        for ( int idx = 1; idx < lastIdx; ++idx ) {
            final List pairCounts = contigTransitsMap.get(contigsList.get(idx));
            if ( pairCounts != null ) {
                final Contig predecessor = contigsList.get(idx - 1);
                final Contig successor = contigsList.get(idx + 1);
                for ( final TransitPairCount tpc : pairCounts ) {
                    if ( tpc.getPrevContig() == predecessor && tpc.getNextContig() == successor ) {
                        tpc.resetCount();
                        break;
                    }
                }
            }
        }
    }

    private static List> findLongestPaths( final List toMatch,
                                                        final List readPaths ) {
        final List> longestPaths = new ArrayList<>();
        for ( final Path path : readPaths ) {
            testPath(path, toMatch, longestPaths);
            testPath(path.rc(), toMatch, longestPaths);
        }
        return longestPaths;
    }

    private static void testPath( final Path path,
                                  final List toMatch,
                                  final List> longestPaths ) {
        final List pathParts = path.getParts();
        final int nPathParts = pathParts.size();
        final List pathContigs =
                pathParts.stream()
                        .map(PathPart::getContig)
                        .collect(Collectors.toCollection(() -> new ArrayList<>(nPathParts)));
        final int matchIdx = Collections.indexOfSubList(pathContigs, toMatch);
        if ( matchIdx != -1 ) {
            final int suffixIdx = matchIdx + toMatch.size();
            if ( suffixIdx < nPathParts ) {
                addSubPathToLongestPaths(extractSubPath(pathContigs, suffixIdx), longestPaths);
            }
        }
    }

    private static List extractSubPath( final List pathContigs,
                                                final int suffixIdx ) {
        final int nPathContigs = pathContigs.size();
        Contig prev = pathContigs.get(suffixIdx - 1);
        final List subPath = new ArrayList<>(nPathContigs - suffixIdx);
        for ( int idx = suffixIdx; idx != nPathContigs; ++idx ) {
            final Contig tig = pathContigs.get(idx);
            if ( tig == null || !prev.getSuccessors().contains(tig) ) break;
            subPath.add(tig);
            prev = tig;
        }
        return subPath;
    }

    private static void addSubPathToLongestPaths( final List subPath,
                                                  final List> longestPaths ) {
        final int nResults = longestPaths.size();
        for ( int idx = 0; idx != nResults; ++idx ) {
            final List test = longestPaths.get(idx);
            if ( isPrefix(subPath, test) ) return;
            if ( isPrefix(test, subPath) ) {
                longestPaths.set(idx, subPath);
                return;
            }
        }
        longestPaths.add(subPath);
    }

    private static boolean isPrefix( final List list1, final List list2 ) {
        final int list1Size = list1.size();
        final int list2Size = list2.size();
        if ( list1Size > list2Size ) return false;
        for ( int idx = 0; idx != list1Size; ++idx ) {
            if ( list1.get(idx) != list2.get(idx) ) return false;
        }
        return true;
    }

    @VisibleForTesting
    static Collection createScaffolds( final List allTraversals,
                                                  final int tooManyScaffolds,
                                                  final int minSVSize ) {
        removeTriviallyDifferentTraversals(allTraversals, minSVSize);

        final int nTraversals = allTraversals.size();
        final Map> traversalsByFirstContig = new HashMap<>(3 * nTraversals);
        for ( int idx = 0; idx != nTraversals; ++idx ) {
            final Traversal traversal = allTraversals.get(idx);
            traversalsByFirstContig.compute(traversal.getFirstContig(),
                    ( k, v ) -> v == null ? new ArrayList<>(3) : v).add(idx);
            final Traversal rcTraversal = traversal.rc();
            traversalsByFirstContig.compute(rcTraversal.getFirstContig(),
                    ( k, v ) -> v == null ? new ArrayList<>(3) : v).add(~idx);
        }

        final List scaffolds = new ArrayList<>(nTraversals);
        final boolean[] touched = new boolean[nTraversals];
        for ( int idx = 0; idx != nTraversals; ++idx ) {
            if ( !touched[idx] ) {
                expandTraversal(idx, touched, traversalsByFirstContig, allTraversals,
                        tooManyScaffolds, scaffolds);
            }
        }
        return scaffolds;
    }

    private static void expandTraversal( final int traversalIdx,
                                         final boolean[] touched,
                                         final Map> traversalsByFirstContig,
                                         final List allTraversals,
                                         final int tooManyScaffolds,
                                         final List scaffolds ) {
        final Traversal traversal = allTraversals.get(traversalIdx);
        touched[traversalIdx] = true;
        final List downExtensions = new ArrayList<>();
        final Set startingContigSet = new HashSet<>();
        walkTraversals(traversal, touched, startingContigSet, traversalsByFirstContig,
                        allTraversals, downExtensions);
        final List upExtensions = new ArrayList<>();
        walkTraversals(traversal.rc(), touched, startingContigSet, traversalsByFirstContig,
                        allTraversals, upExtensions);
        for ( final Traversal down : downExtensions ) {
            for ( final Traversal up : upExtensions ) {
                if ( scaffolds.size() >= tooManyScaffolds ) {
                    throw new AssemblyTooComplexException();
                }
                scaffolds.add(
                        Traversal.combineOverlappers(up.rc(), down, traversal.getContigs().size()));
            }
        }
    }

    private static void walkTraversals( final Traversal traversal,
                                        final boolean[] touched,
                                        final Set startingContigSet,
                                        final Map> traversalsByFirstContig,
                                        final List allTraversals,
                                        final List extensions ) {
        final Contig firstContig = traversal.getFirstContig();
        final List indexList;
        if ( startingContigSet.contains(firstContig) ||
                traversal.isInextensible() ||
                (indexList = traversalsByFirstContig.get(traversal.getLastContig())) == null ) {
            extensions.add(traversal);
            return;
        }
        startingContigSet.add(firstContig);
        for ( int idx : indexList ) {
            final Traversal extension;
            if ( idx >= 0 ) {
                extension = allTraversals.get(idx);
                touched[idx] = true;
            } else {
                final int rcIdx = ~idx;
                extension = allTraversals.get(rcIdx).rc();
                touched[rcIdx] = true;
            }
            walkTraversals(Traversal.combine(traversal, extension), touched, startingContigSet,
                            traversalsByFirstContig, allTraversals, extensions );
        }
        startingContigSet.remove(firstContig);
    }

    private static void removeTriviallyDifferentTraversals(
                                            final Collection allTraversals,
                                            final int minSVSize ) {
        if ( allTraversals.isEmpty() ) {
            return;
        }
        final TreeSet sortedTraversals = new TreeSet<>(new TraversalEndpointComparator());
        for ( final Traversal traversal : allTraversals ) {
            sortedTraversals.add(traversal);
            sortedTraversals.add(traversal.rc());
        }
        final Iterator traversalIterator = sortedTraversals.iterator();
        Traversal prevTraversal = traversalIterator.next();
        while ( traversalIterator.hasNext() ) {
            final Traversal curTraversal = traversalIterator.next();
            if ( isTriviallyDifferent(prevTraversal, curTraversal, minSVSize) ) {
                traversalIterator.remove();
            } else {
                prevTraversal = curTraversal;
            }
        }
        // remove duplicates where we have both strands surviving
        final Iterator traversalIterator2 = sortedTraversals.iterator();
        while ( traversalIterator2.hasNext() ) {
            final Traversal traversal = traversalIterator2.next();
            if ( sortedTraversals.contains(traversal.rc()) ) {
                traversalIterator2.remove();
            }
        }
        allTraversals.clear();
        allTraversals.addAll(sortedTraversals);
    }

    private static boolean isTriviallyDifferent( final Traversal traversal1,
                                                 final Traversal traversal2,
                                                 final int minSVSize ) {
        final Contig firstContig1 = traversal1.getFirstContig();
        final Contig lastContig1 = traversal1.getLastContig();
        final Contig firstContig2 = traversal2.getFirstContig();
        final Contig lastContig2 = traversal2.getLastContig();
        if ( firstContig1 != firstContig2 || lastContig1 != lastContig2 ) {
            return false;
        }
        final int interiorSize1 =
                traversal1.getSequenceLength() - firstContig1.size() - lastContig1.size();
        final int interiorSize2 =
                traversal2.getSequenceLength() - firstContig2.size() - lastContig2.size();

        // if the path lengths are so different that one could harbor an SV, they're not trivially different
        if ( Math.abs(interiorSize1 - interiorSize2) >= minSVSize ) {
            return false;
        }

        // if the paths are small enough that there can't be an SV's worth of differences, they're trivially different
        final int maxInteriorSize = Math.max(interiorSize1, interiorSize2);
        if ( maxInteriorSize < minSVSize ) {
            return true;
        }

        // dang, maybe there's enough material in common that there can't be an SV's worth of differences
        // run a longest common subsequence algorithm to figure out the length of the common material
        // DP matrix holds length of common material
        final List contigs1 = traversal1.getContigs();
        final int rowLen = contigs1.size() - 1;
        final int[][] rowPair = new int[2][];
        rowPair[0] = new int[rowLen];
        rowPair[1] = new int[rowLen];
        int pairIdx = 0;
        final List contigs2 = traversal2.getContigs();
        final int nRows = contigs2.size() - 1;
        for ( int idx2 = 1; idx2 != nRows; ++idx2 ) {
            final int[] curRow = rowPair[pairIdx];
            final int[] prevRow = rowPair[pairIdx ^ 1];
            pairIdx ^= 1;

            final int id2 = contigs2.get(idx2).getId();
            for ( int idx1 = 1; idx1 != rowLen; ++idx1 ) {
                final Contig tig1 = contigs1.get(idx1);
                if ( tig1.getId() == id2 ) {
                    // if the previous cells also contain a match we've already removed the K-1 bases upstream
                    final boolean extendMatch =
                            contigs1.get(idx1 -1).getId() == contigs2.get(idx2 - 1).getId();
                    curRow[idx1] = prevRow[idx1 - 1] + (extendMatch ? tig1.getNKmers() : tig1.size());
                } else {
                    curRow[idx1] = Math.max(curRow[idx1 - 1], prevRow[idx1]);
                }
            }
        }
        final int commonLen = rowPair[pairIdx ^ 1][rowLen - 1];
        return (maxInteriorSize - commonLen) < minSVSize;
    }

    public static class TraversalEndpointComparator implements Comparator {
        @Override
        public int compare( final Traversal traversal1, final Traversal traversal2 ) {
            final List contigs1 = traversal1.getContigs();
            final List contigs2 = traversal2.getContigs();
            int cmp = Integer.compare(contigs1.get(0).getId(), contigs2.get(0).getId());
            if ( cmp != 0 ) {
                return cmp;
            }
            final int last1 = contigs1.size() - 1;
            final int last2 = contigs2.size() - 1;
            cmp = Integer.compare(contigs1.get(last1).getId(), contigs2.get(last2).getId());
            if ( cmp != 0 ) {
                return cmp;
            }
            // among those starting and ending at the same place, sort least observed last
            cmp = -Integer.compare(traversal1.getMinMaxObservations(),
                                    traversal2.getMinMaxObservations());
            if ( cmp != 0 ) {
                return cmp;
            }
            final int end = Math.min(last1, last2);
            for ( int idx = 1; idx < end; ++idx ) {
                cmp = Integer.compare(contigs1.get(idx).getId(), contigs2.get(idx).getId());
                if ( cmp != 0 ) {
                    return cmp;
                }
            }
            return Integer.compare(last1, last2);
        }
    }

    private static void writeGFA( final GATKPath gfaFile,
                                  final Collection contigs,
                                  final Collection traversals ) {
        for ( final ContigImpl contig : contigs ) {
            contig.setMarked(false);
        }

        try ( final BufferedWriter writer = createBufferedWriter(gfaFile) ) {
            writer.write("H\tVN:Z:2.0");
            writer.newLine();
            for ( final Contig contig : contigs ) {
                if ( !contig.isMarked() ) {
                    writeContig(contig, writer);
                }
            }
            for ( final Traversal traversal : traversals ) {
                writer.write(traversal.getContigs().stream()
                        .map(Contig::toRef)
                        .collect(Collectors.joining(" ", "O\t*\t", "")));
                writer.newLine();
            }
        } catch ( final IOException ioe ) {
            throw new UserException("Failed to write gfa-file " + gfaFile, ioe);
        }
    }

    private static void writeContig( final Contig contig,
                                     final BufferedWriter writer ) throws IOException {
        final Contig canonicalContig = contig.canonical();
        canonicalContig.setMarked(true);
        final CharSequence seq = canonicalContig.getSequence();
        writer.write("S\t" + canonicalContig + "\t" + seq.length() + "\t" + seq +
                "\tMO:i:" + canonicalContig.getMaxObservations() +
                "\tFO:i:" + canonicalContig.getFirstKmer().getNObservations() +
                "\tLO:i:" + canonicalContig.getLastKmer().getNObservations());
        writer.newLine();

        for ( final Contig successor : contig.getSuccessors() ) {
            if ( !successor.isMarked() ) {
                writeContig(successor, writer);
            }
            writeEdge(contig, successor, writer);
        }
        for ( final Contig predecessor : contig.getPredecessors() ) {
            if ( !predecessor.isMarked() ) {
                writeContig(predecessor, writer);
            }
        }
    }

    private static void writeEdge( final Contig contig,
                                   final Contig successor,
                                   final BufferedWriter writer ) throws IOException {
        final int contigLength = contig.getSequence().length();
        writer.write("E\t*\t" + contig.toRef() + "\t" + successor.toRef() + "\t" +
                (contigLength - Kmer.KSIZE + 1) + "\t" + contigLength + "$\t0\t" +
                (Kmer.KSIZE - 1) + "\t" + (Kmer.KSIZE - 1) + "M");
        writer.newLine();
    }

    private static void writeTraversals( final GATKPath fastaFile,
                                         final String assemblyName,
                                         final Collection traversals ) {
        try ( final BufferedWriter writer = createBufferedWriter(fastaFile) ) {
            int traversalNo = 0;
            for ( final Traversal traversal : traversals ) {
                writer.write(">");
                if ( assemblyName != null ) {
                    writer.write(assemblyName);
                    writer.write("_");
                }
                writer.write("t");
                writer.write(Integer.toString(++traversalNo));
                writer.write(" ");
                writer.write(traversal.toString());
                writer.newLine();
                writer.write(traversal.getSequence());
                writer.newLine();
            }
        } catch ( final IOException ioe ) {
            throw new UserException("Failed to write fasta-file " + fastaFile, ioe);
        }
    }

    private static BufferedWriter createBufferedWriter( final GATKPath path ) throws IOException {
        final OutputStream os = path.getOutputStream();
        final String pathString = path.getRawInputString();
        if ( !pathString.endsWith(".gz") && !pathString.endsWith(".GZ") ) {
            return new BufferedWriter(new OutputStreamWriter(os));
        }
        return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(os)));
    }

    /** fixed-size, immutable kmer.  usual 2-bit encoding: ACGT->0123.  low order bits are final call. */
    public static class Kmer {
        public static final int KSIZE = 31; // must be odd number less than 32
        public static final long KMASK = (1L << 2*KSIZE) - 1L;
        private final long kVal;

        public Kmer( final long kVal ) { this.kVal = kVal; }

        public long getKVal() { return kVal; }
        public boolean isCanonical() { return isCanonical(kVal); }
        public int getInitialCall() { return (int)(kVal >> (KSIZE*2 - 2)) & 3; }
        public int getFinalCall() { return (int)kVal & 3; }

        public long getPredecessorVal( final int call ) {
            return (kVal >> 2) | ((long)call << (2 * (KSIZE - 1)));
        }
        public long getSuccessorVal( final int call ) { return ((kVal << 2) & KMASK) | call; }

        public static boolean isCanonical( final long val ) {
            return (val & (1L << KSIZE)) == 0L;
        }

        @Override public boolean equals( final Object obj ) {
            return obj instanceof Kmer && kVal == ((Kmer)obj).kVal;
        }

        @Override public int hashCode() {
            return (int)(kVal ^ (kVal >>> 32));
        }
    }

    /** Set of Kmers.  Uses HopscotchSet, customized to find correct starting bin for Kmers and derivatives. */
    public static final class KmerSet extends HopscotchSet {
        public KmerSet( final int capacity ) { super(capacity); }

        @Override
        protected int hashToIndex( final Object kmer ) {
            final long positiveKval =
                    ((HopscotchSet.SPREADER * ((Kmer)kmer).getKVal()) & Long.MAX_VALUE);
            return (int)(positiveKval % capacity());
        }
    }

    /**
     *  A Kmer that remembers its predecessors and successors, and the number of times it's been observed
     *  in the assembly's input set of reads.
     *  The masks are bit-wise (1=A, 2=C, 4=G, 8=T) to show which predecessors or successors have been observed.
     *  The Kmer's position on a Contig is also tracked (in later phases of the assembly process).
     */
    public static abstract class KmerAdjacency extends Kmer {
        public KmerAdjacency( final long kVal ) { super(kVal); }

        public abstract KmerAdjacency getSolePredecessor(); // returns null if there's 0 or >1 predecessors
        public abstract int getPredecessorMask();
        public abstract int getPredecessorCount();
        public abstract void removePredecessor( final int callToRemove,
                                                final KmerSet kmerAdjacencySet );

        public abstract KmerAdjacency getSoleSuccessor(); // returns null if there's 0 or > 1 successors
        public abstract int getSuccessorMask();
        public abstract int getSuccessorCount();
        public abstract void removeSuccessor( final int callToRemove,
                                              final KmerSet kmerAdjacencySet );

        public abstract Contig getContig();
        public abstract int getContigOffset();
        // offset is 0-based measure on the contig sequence of the beginning of the kmer
        public abstract void setContigOffset( final Contig contig, final int contigOffset );
        public abstract void clearContig();

        public abstract int getNObservations();
        public abstract KmerAdjacency rc();
        public abstract KmerAdjacencyImpl canonical();

        public void observe( final KmerAdjacency predecessor, final KmerAdjacency successor ) {
            observe(predecessor, successor, 1);
        }

        public abstract void observe( final KmerAdjacency predecessor,
                                      final KmerAdjacency successor,
                                      final int count );

        @Override public String toString() {
            final StringBuilder sb = new StringBuilder(KSIZE);
            long currentVal = getKVal();
            for ( int idx = 0; idx != KSIZE; ++idx ) {
                sb.append("ACGT".charAt((int)currentVal & 3));
                currentVal >>= 2;
            }
            sb.reverse(); // low order bits were loaded into sb first:  fix that now by reversing the sb.
            return sb.toString();
        }

        /**
         * Transform a read's calls into KmerAdjacencies, and add them to a KmerSet.
         * Skip kmers that include a call with a quality < qMin.
         * Skip kmers with non-ACGT calls.
         */
        public static void kmerize( final byte[] calls,
                                    final byte[] quals,
                                    final byte qMin,
                                    final KmerSet kmerAdjacencySet ) {
            int currentCount = 0; // number of calls loaded into currentKVal
            long currentKVal = 0;
            KmerAdjacency prevAdjacency = null;
            KmerAdjacency currentAdjacency = null;
            for ( int idx = 0; idx < calls.length; ++idx ) {
                if ( quals[idx] < qMin ) { // if we encounter a low-quality call
                    // take care of the most recent valid KmerAdjacency, if any
                    if ( currentAdjacency != null ) {
                        currentAdjacency.observe(prevAdjacency, null);
                    }
                    // ready ourselves to accumulate calls afresh
                    currentCount = 0;
                    currentAdjacency = prevAdjacency = null;
                    continue;
                }
                currentKVal <<= 2;
                switch ( calls[idx] ) {
                    case 'A': case 'a': break;
                    case 'C': case 'c': currentKVal += 1; break;
                    case 'G': case 'g': currentKVal += 2; break;
                    case 'T': case 't': currentKVal += 3; break;
                    default:
                        if ( currentAdjacency != null ) {
                            currentAdjacency.observe(prevAdjacency, null);
                        }
                        currentCount = 0;
                        currentAdjacency = prevAdjacency = null;
                        continue;
                }
                if ( ++currentCount >= KSIZE ) { // if we've loaded enough calls to make a complete kmer
                    final KmerAdjacency nextAdjacency = findOrAdd(currentKVal, kmerAdjacencySet);
                    if ( currentAdjacency != null ) {
                        currentAdjacency.observe(prevAdjacency, nextAdjacency);
                    }
                    prevAdjacency = currentAdjacency;
                    currentAdjacency = nextAdjacency;
                }
            }
            if ( currentAdjacency != null ) {
                currentAdjacency.observe(prevAdjacency, null);
            }
        }

        /**
         * Kmerize a String.  This version is for gap fills.
         * The number of observations applies to all kmers except the 1st and last.
         */
        public static void kmerize( final String sequence,
                                    final int nObservations,
                                    final KmerSet kmerAdjacencySet ) {
            int currentCount = 0;
            long currentKVal = 0;
            int nObs = 0;
            KmerAdjacency prevAdjacency = null;
            KmerAdjacency currentAdjacency = null;
            final int nCalls = sequence.length();
            for ( int idx = 0; idx != nCalls; ++idx ) {
                currentKVal <<= 2;
                switch ( sequence.charAt(idx) ) {
                    case 'A': case 'a': break;
                    case 'C': case 'c': currentKVal += 1; break;
                    case 'G': case 'g': currentKVal += 2; break;
                    case 'T': case 't': currentKVal += 3; break;
                    default: throw new GATKException("unexpected base call in string to kmerize.");
                }
                if ( ++currentCount >= KSIZE ) {
                    final KmerAdjacency nextAdjacency = findOrAdd(currentKVal, kmerAdjacencySet);
                    if ( currentAdjacency != null ) {
                        currentAdjacency.observe(prevAdjacency, nextAdjacency, nObs);
                        nObs = nObservations;
                    }
                    prevAdjacency = currentAdjacency;
                    currentAdjacency = nextAdjacency;
                }
            }
            if ( currentAdjacency != null ) {
                currentAdjacency.observe(prevAdjacency, null, 0);
            }
        }

        // Lookup table for reverse-complementing each possible byte value.
        // Each pair of bits represents a base, so you have to reverse bits pairwise and then invert all bits.
        // This is most quickly and easily done with a lookup table.
        private static final long[] BYTEWISE_REVERSE_COMPLEMENT;
        static {
            BYTEWISE_REVERSE_COMPLEMENT = new long[256];
            for ( int bIn = 0; bIn != 256; ++bIn ) {
                BYTEWISE_REVERSE_COMPLEMENT[bIn] =
                        ~(((bIn & 3) << 6) | (((bIn >> 2) & 3) << 4) |
                                (((bIn >> 4) & 3) << 2) | ((bIn >> 6) & 3)) & 0xffL;
            }
        }

        public static long reverseComplement( long val ) {
            // process val one byte at a time
            long result = BYTEWISE_REVERSE_COMPLEMENT[(int)val & 0xFF]; // handle the low-order byte
            int nBytes = 8;
            while ( --nBytes != 0 ) { // pre-decrementing:  we'll go through the loop 7 times
                // rotate down by a byte
                val >>= 8;
                // rotate up by a byte and OR in the reverse complement of the next byte
                result = (result << 8) | BYTEWISE_REVERSE_COMPLEMENT[(int)val & 0xFF];
            }
            return result >>> (Long.SIZE - 2*KSIZE);
        }

        // Kmer lookup in KmerSet.
        // KmerSets holding KmerAdjacencies have only canonical Kmers, so RC non-canonical kmers before lookup.
        public static KmerAdjacency find( final long kVal,
                                          final KmerSet kmerAdjacencySet ) {
            if ( isCanonical(kVal) ) return kmerAdjacencySet.find(new Kmer(kVal & KMASK));
            final KmerAdjacency result = kmerAdjacencySet.find(new Kmer(reverseComplement(kVal)));
            return result == null ? null : result.rc();
        }

        // Kmer lookup in KmerSet.
        // KmerSets holding KmerAdjacencies have only canonical Kmers, so RC non-canonical kmers before lookup.
        // Add missing Kmers.
        public static KmerAdjacency findOrAdd( final long kVal,
                                               final KmerSet kmerAdjacencySet ) {
            if ( isCanonical(kVal) ) {
                return kmerAdjacencySet.findOrAdd(new Kmer(kVal & KMASK), kmer ->
                        new KmerAdjacencyImpl(((Kmer)kmer).getKVal()));
            }
            return kmerAdjacencySet.findOrAdd(new Kmer(reverseComplement(kVal)), kmer ->
                    new KmerAdjacencyImpl(((Kmer)kmer).getKVal())).rc();
        }
    }

    /**
     * Class to implement KmerAdjacency for canonical Kmers.
     * In particular, a KmerSet created on KmerAdjacency contains only canonical Kmers.
     */
    public static final class KmerAdjacencyImpl extends KmerAdjacency {
        private final KmerAdjacencyRC rc; // the reverse-complement of this kmer

        // these fields won't change much after all reads have been kmerized, but they do change
        // during that process.  and maybe just a little during gap filling
        private KmerAdjacency solePredecessor; // set to null if there are no predecessors, or multiple predecessors
        private KmerAdjacency soleSuccessor; // set to null if there are no successors, or multiple successors
        private int predecessorMask; // bit mask of observed kmers preceding this one
        private int successorMask; // bit mask observed kmers following this one
        private int nObservations; // the reads in which this kmer was observed

        // these fields refer back to the graph:  they'll change as the graph structure is refined
        private Contig contig; // the contig that contains this Kmer
        private int contigOffset; // the offset within the contig where this kmer is found

        private static final int[] COUNT_FOR_MASK =
                //side sum for binary values from 0 -> 15
                //0000  0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
                {    0,    1,   1,   2,   1,   2,   2,   3,   1,   2,   2,   3,   2,   3,   3,   4 };

        public KmerAdjacencyImpl( final long kVal ) {
            super(kVal);
            this.rc = new KmerAdjacencyRC(this);
        }

        @Override public KmerAdjacency getSolePredecessor() { return solePredecessor; } // may return null
        @Override public int getPredecessorMask() { return predecessorMask; }
        @Override public int getPredecessorCount() { return COUNT_FOR_MASK[predecessorMask]; }
        @Override
        public void removePredecessor( final int callToRemove,
                                       final KmerSet kmerAdjacencySet ) {
            predecessorMask &= ~(1 << callToRemove);
            solePredecessor = null;
            if ( getPredecessorCount() == 1 ) {
                for ( int call = 0; call != 4; ++call ) {
                    if ( ((1 << call) & predecessorMask) != 0 ) {
                        solePredecessor = find(getPredecessorVal(call), kmerAdjacencySet);
                        break;
                    }
                }
            }
        }

        @Override public KmerAdjacency getSoleSuccessor() { return soleSuccessor; } // may return null
        @Override public int getSuccessorMask() { return successorMask; }
        @Override public int getSuccessorCount() { return COUNT_FOR_MASK[successorMask]; }
        @Override
        public void removeSuccessor( final int callToRemove,
                                     final KmerSet kmerAdjacencySet ) {
            successorMask &= ~(1 << callToRemove);
            soleSuccessor = null;
            if ( getSuccessorCount() == 1 ) {
                for ( int call = 0; call != 4; ++call ) {
                    if ( ((1 << call) & successorMask) != 0 ) {
                        soleSuccessor = find(getSuccessorVal(call), kmerAdjacencySet);
                        break;
                    }
                }
            }
        }

        @Override public Contig getContig() { return contig; }
        @Override public int getContigOffset() { return contigOffset; }
        @Override public void setContigOffset( final Contig contig, final int contigOffset ) {
            if ( this.contig != null ) {
                throw new GATKException("Internal error: overwriting kmer contig and offset.");
            }
            this.contig = contig;
            this.contigOffset = contigOffset;
        }
        @Override public void clearContig() { contig = null; contigOffset = 0; }

        @Override public int getNObservations() { return nObservations; }
        @Override public KmerAdjacency rc() { return rc; }
        @Override public KmerAdjacencyImpl canonical() { return this; }

        @Override public void observe( final KmerAdjacency predecessor,
                                       final KmerAdjacency successor,
                                       final int count ) {
            if ( predecessor != null ) {
                if ( predecessor.getSuccessorVal(getFinalCall()) != getKVal() ) {
                    throw new GATKException("illegal predecessor");
                }
                final int initialCall = predecessor.getInitialCall();
                final int newPredecessorMask = 1 << initialCall;
                if ( (newPredecessorMask & predecessorMask) == 0 ) {
                    if ( predecessorMask == 0 ) {
                        solePredecessor = predecessor;
                        predecessorMask = newPredecessorMask;
                    } else {
                        solePredecessor = null;
                        predecessorMask |= newPredecessorMask;
                    }
                }
            }
            if ( successor != null ) {
                if ( successor.getPredecessorVal(getInitialCall()) != getKVal() ) {
                    throw new GATKException("illegal successor");
                }
                final int finalCall = successor.getFinalCall();
                final int newSuccessorMask = 1 << finalCall;
                if ( (newSuccessorMask & successorMask) == 0 ) {
                    if ( successorMask == 0 ) {
                        soleSuccessor = successor;
                        successorMask = newSuccessorMask;
                    } else {
                        soleSuccessor = null;
                        successorMask |= newSuccessorMask;
                    }
                }
            }
            nObservations += count;
        }
    }

    /**
     * Class to implement KmerAdjacency for Kmers that are the reverse-complement of a canonical Kmer.
     * In particular, a KmerSet created on KmerAdjacency contains only canonical Kmers.
     * A KmerAdjacencyRC represents the RC of each Kmer in the KmerSet.
     */
    public static final class KmerAdjacencyRC extends KmerAdjacency {
        private final KmerAdjacencyImpl rc;

        // lookup table to bit-reverse nibbles
        private static final int[] NIBREV =
                // 0000,  0001,  0010,  0011,  0100,  0101,  0110,  0111,  1000,  1001,  1010,  1011,  1100,  1101,  1110,  1111
                {0b0000,0b1000,0b0100,0b1100,0b0010,0b1010,0b0110,0b1110,0b0001,0b1001,0b0101,0b1101,0b0011,0b1011,0b0111,0b1111};

        public KmerAdjacencyRC( final KmerAdjacencyImpl rc ) {
            super(reverseComplement(rc.getKVal()));
            this.rc = rc;
        }

        @Override public KmerAdjacency getSolePredecessor() {
            final KmerAdjacency successor = rc.getSoleSuccessor();
            return successor == null ? null : successor.rc();
        }
        @Override public int getPredecessorMask() { return NIBREV[rc.getSuccessorMask()]; }
        @Override public int getPredecessorCount() { return rc.getSuccessorCount(); }
        @Override
        public void removePredecessor( final int callToRemove,
                                       final KmerSet kmerAdjacencySet ) {
            rc.removeSuccessor(3 - callToRemove, kmerAdjacencySet);
        }

        @Override public KmerAdjacency getSoleSuccessor() {
            final KmerAdjacency predecessor = rc.getSolePredecessor();
            return predecessor == null ? null : predecessor.rc();
        }
        @Override public int getSuccessorMask() { return NIBREV[rc.getPredecessorMask()]; }
        @Override public int getSuccessorCount() { return rc.getPredecessorCount(); }
        @Override
        public void removeSuccessor( final int callToRemove,
                                     final KmerSet kmerAdjacencySet ) {
            rc.removePredecessor(3 - callToRemove, kmerAdjacencySet);
        }

        @Override public Contig getContig() {
            final Contig contig = rc.getContig();
            return contig == null ? null : contig.rc();
        }
        @Override public int getContigOffset() {
            final Contig contig = rc.getContig();
            return contig == null ? 0 : contig.size() - rc.getContigOffset() - KSIZE;
        }
        @Override public void setContigOffset( final Contig contig, final int contigOffset ) {
            rc.setContigOffset(contig.rc(), contig.size() - contigOffset - KSIZE);
        }
        @Override public void clearContig() { rc.clearContig(); }

        @Override public int getNObservations() { return rc.getNObservations(); }
        @Override public KmerAdjacency rc() { return rc; }
        @Override public KmerAdjacencyImpl canonical() { return rc; }

        @Override public void observe( final KmerAdjacency predecessor,
                                       final KmerAdjacency successor,
                                       final int count ) {
            rc.observe(successor == null ? null : successor.rc(),
                    predecessor == null ? null : predecessor.rc(),
                    count);
        }
    }

    public enum ContigOrientation {
        FWD, // k-mer appears at the 5' end of the contig
        REV, // k-mer appears at the 5' end of the reverse-complemented contig
        BOTH // k-mer occurs on 5' end of the contig and its RC (can happen when the contig is a palindrome)
    }

    /** Initial or final Kmer in a Contig. */
    public static final class ContigEndKmer extends Kmer {
        private final Contig contig;
        private final ContigOrientation contigOrientation;

        public ContigEndKmer( final long kVal,
                              final Contig contig,
                              final ContigOrientation contigEnd ) {
            super(kVal);
            this.contig = contig;
            this.contigOrientation = contigEnd;
        }

        public Contig getContig() { return contig; }
        public ContigOrientation getContigOrientation() { return contigOrientation; }
    }

    /**
     * An unbranched sequence of Kmers.
     * Each Kmer (except the last one) has a single successor, which allows enumerating the sequence
     * of Kmers in the Contig.  The sequence of base calls in the Contig is just the sequence of kmers
     * with the K-1 overlapping calls removed from adjacent kmers.
     */
    public interface Contig {
        int getId();
        String toRef(); // a GFA-format reference
        CharSequence getSequence();
        int getMaxObservations();
        KmerAdjacency getFirstKmer();
        KmerAdjacency getLastKmer();
        List getPredecessors();
        List getSuccessors();
        int size();
        default int getNKmers() { return size() - Kmer.KSIZE + 1; }
        Contig rc();
        boolean isCycleMember();
        void setIsCycleMember( final boolean isCycleMember );
        boolean isMarked();
        void setMarked( final boolean marked );
        boolean isCanonical();
        ContigImpl canonical();

        default boolean isPredecessor( final Contig contig ) {
            return findContig(getPredecessors(), contig);
        }
        default boolean isSuccessor( final Contig contig ) {
            return findContig(getSuccessors(), contig);
        }

        static boolean findContig( final List contigs, final Contig contig ) {
            for ( final Contig tig : contigs ) {
                if ( contig == tig ) {
                    return true;
                }
            }
            return false;
        }
    }

    /** Simple implementation of Contig interface. */
    public static final class ContigImpl implements Contig {
        private final int id;
        private final CharSequence sequence;
        private final int maxObservations;
        private final KmerAdjacency firstKmer;
        private final KmerAdjacency lastKmer;
        private final List predecessors;
        private final List successors;
        private boolean cyclic;
        private boolean marked;
        private final Contig rc;

        public ContigImpl( final int id, final KmerAdjacency firstKmerAdjacency ) {
            this.id = id;
            final StringBuilder sb = new StringBuilder(firstKmerAdjacency.toString());
            int maxObservations = firstKmerAdjacency.getNObservations();
            KmerAdjacency lastKmerAdjacency = firstKmerAdjacency;
            for ( KmerAdjacency kmerAdjacency = firstKmerAdjacency.getSoleSuccessor();
                  kmerAdjacency != null;
                  kmerAdjacency = kmerAdjacency.getSoleSuccessor() ) {
                // if we've gone around a circle, or if we're branching backwards, or if we hit a palindrome u-turn
                if ( firstKmerAdjacency == kmerAdjacency ||
                        kmerAdjacency.getPredecessorCount() != 1 ||
                        kmerAdjacency == lastKmerAdjacency.rc() ) {
                    break;
                }
                sb.append("ACGT".charAt(kmerAdjacency.getFinalCall()));
                maxObservations = Math.max(maxObservations, kmerAdjacency.getNObservations());
                lastKmerAdjacency = kmerAdjacency;
            }
            this.sequence = sb.toString();
            this.maxObservations = maxObservations;
            this.firstKmer = firstKmerAdjacency;
            this.lastKmer = lastKmerAdjacency;
            this.predecessors = new ArrayList<>(firstKmer.getPredecessorCount());
            this.successors = new ArrayList<>(lastKmer.getSuccessorCount());
            this.rc = new ContigRCImpl(this);
        }

        // create a new contig by joining two contigs
        public ContigImpl( final int id, final Contig predecessor, final Contig successor ) {
            if ( predecessor == successor || predecessor == successor.rc() ) {
                throw new GATKException("can't self-join");
            }
            if ( !checkOverlap(predecessor.getSequence(), successor.getSequence()) ) {
                throw new GATKException("sequences can't be joined");
            }
            this.id = id;
            final StringBuilder sb = new StringBuilder(predecessor.getSequence());
            final CharSequence successorSequence = successor.getSequence();
            sb.append(successorSequence.subSequence(Kmer.KSIZE - 1, successorSequence.length()));
            this.sequence = sb.toString();
            this.maxObservations =
                    Math.max(predecessor.getMaxObservations(), successor.getMaxObservations());
            this.firstKmer = predecessor.getFirstKmer();
            this.lastKmer = successor.getLastKmer();
            this.predecessors = new ArrayList<>(predecessor.getPredecessors().size());
            this.successors = new ArrayList<>(successor.getSuccessors().size());
            this.rc = new ContigRCImpl(this);

            // fix predecessor linkages to point to new contig
            for ( final Contig predPredecessor : predecessor.getPredecessors() ) {
                if ( predPredecessor == successor ) {
                    predecessors.add(this);
                } else if ( predPredecessor == predecessor.rc() ) {
                    predecessors.add(rc);
                } else {
                    predecessors.add(predPredecessor);
                    final List successors = predPredecessor.getSuccessors();
                    successors.set(successors.indexOf(predecessor), this);
                }
            }

            // fix successor linkages to point to new contig
            for ( final Contig succSuccessor : successor.getSuccessors() ) {
                if ( succSuccessor == predecessor ) {
                    successors.add(this);
                } else if ( succSuccessor == successor.rc() ) {
                    successors.add(rc);
                } else {
                    successors.add(succSuccessor);
                    final List predecessors = succSuccessor.getPredecessors();
                    predecessors.set(predecessors.indexOf(successor), this);
                }
            }
        }

        @Override
        public int getId() {
            return id;
        }

        @Override
        public String toRef() { return toString() + "+"; }

        @Override
        public CharSequence getSequence() {
            return sequence;
        }

        @Override
        public int getMaxObservations() {
            return maxObservations;
        }

        @Override
        public KmerAdjacency getFirstKmer() {
            return firstKmer;
        }

        @Override
        public KmerAdjacency getLastKmer() {
            return lastKmer;
        }

        @Override
        public List getPredecessors() {
            return predecessors;
        }

        @Override
        public List getSuccessors() {
            return successors;
        }

        @Override
        public int size() {
            return sequence.length();
        }

        @Override
        public Contig rc() {
            return rc;
        }

        @Override
        public boolean isCycleMember() {
            return cyclic;
        }

        @Override
        public void setIsCycleMember( final boolean isCycleMember ) {
            this.cyclic = isCycleMember;
        }

        @Override
        public boolean isMarked() {
            return marked;
        }

        @Override
        public void setMarked( final boolean marked ) {
            this.marked = marked;
        }

        @Override
        public boolean isCanonical() {
            return true;
        }

        @Override
        public ContigImpl canonical() {
            return this;
        }

        @Override
        public String toString() {
            return "c" + id;
        }

        private static boolean checkOverlap( final CharSequence seq1, final CharSequence seq2 ) {
            final int seq1Len = seq1.length();
            final CharSequence seq1SubSeq = seq1.subSequence(seq1Len - Kmer.KSIZE + 1, seq1Len);
            final CharSequence seq2SubSeq = seq2.subSequence(0, Kmer.KSIZE - 1);
            return seq1SubSeq.equals(seq2SubSeq);
        }
    }

    /**
     * Implementation of Contig for the reverse-complement of some other Contig.
     * Which one is the "real" Contig, and which is the "RC" is completely arbitrary, since there
     * is no notion of canonical for Contigs.
     */
    public static final class ContigRCImpl implements Contig {
        private final CharSequence sequence;
        private final List predecessors;
        private final List successors;
        private final ContigImpl rc;

        public ContigRCImpl( final ContigImpl contig ) {
            this.sequence = new SequenceRC(contig.getSequence());
            this.predecessors = new ContigListRC(contig.getSuccessors());
            this.successors = new ContigListRC(contig.getPredecessors());
            this.rc = contig;
        }

        @Override public int getId() { return ~rc.getId(); }
        @Override public String toRef() { return rc.toString() + "-"; }
        @Override public CharSequence getSequence() { return sequence; }
        @Override public int getMaxObservations() { return rc.getMaxObservations(); }
        @Override public KmerAdjacency getFirstKmer() { return rc.getLastKmer().rc(); }
        @Override public KmerAdjacency getLastKmer() { return rc.getFirstKmer().rc(); }
        @Override public List getPredecessors() { return predecessors; }
        @Override public List getSuccessors() { return successors; }
        @Override public int size() { return sequence.length(); }
        @Override public Contig rc() { return rc; }
        @Override public boolean isCycleMember() { return rc.isCycleMember(); }
        @Override public void setIsCycleMember( final boolean isCycleMember ) { rc.setIsCycleMember(isCycleMember); }
        @Override public boolean isMarked() { return rc.isMarked(); }
        @Override public void setMarked( final boolean marked ) { rc.setMarked(marked); }
        @Override public boolean isCanonical() { return false; }
        @Override public ContigImpl canonical() { return rc; }
        @Override public String toString() { return rc.toString() + "RC"; }
    }

    /** A CharSequence that is a view of the reverse-complement of another sequence. */
    public static final class SequenceRC implements CharSequence, Comparable {
        private final int lenLess1;
        private final CharSequence sequence;

        public SequenceRC( final CharSequence sequence ) {
            this.lenLess1 = sequence.length() - 1;
            this.sequence = sequence;
        }

        @Override public int length() { return sequence.length(); }
        @Override public char charAt( final int index ) {
            final char result;
            switch ( Character.toUpperCase(sequence.charAt(lenLess1 - index)) ) {
                case 'A': result = 'T'; break;
                case 'C': result = 'G'; break;
                case 'G': result = 'C'; break;
                case 'T': result = 'A'; break;
                default: result = 'N'; break;
            }
            return result;
        }
        @Override public CharSequence subSequence( final int start, final int end ) {
            return new StringBuilder(end - start).append(this, start, end).toString();
        }
        @Override public String toString() { return new StringBuilder(this).toString(); }

        @Override public int compareTo( final CharSequence charSequence ) {
            final int len1 = length();
            final int len2 = charSequence.length();
            final int cmpLen = Math.min(len1, len2);
            for ( int idx = 0; idx != cmpLen; ++idx ) {
                final char char1 = charAt(idx);
                final char char2 = Character.toUpperCase(charSequence.charAt(idx));
                if ( char1 > char2 ) return 1;
                if ( char1 < char2 ) return -1;
            }
            return Integer.compare(len1, len2);
        }
    }

    /** A list of Contigs that presents a reverse-complemented view of a List of Contigs. */
    public static final class ContigListRC extends AbstractList {
        private final List contigList;

        public ContigListRC( final List contigList ) {
            this.contigList = contigList;
        }

        @Override public Contig get( final int index ) {
            return contigList.get(reflectIndex(index)).rc();
        }
        @Override public int size() { return contigList.size(); }
        @Override public Contig set( final int index, final Contig contig ) {
            return contigList.set(reflectIndex(index), contig.rc()).rc();
        }
        @Override public void add( final int index, final Contig contig ) {
            contigList.add(reflectIndex(index), contig.rc());
        }
        @Override public Contig remove( final int index ) {
            return contigList.remove(reflectIndex(index)).rc();
        }

        public List rc() { return contigList; }

        private int reflectIndex( final int index ) { return size() - 1 - index; }
    }

    /** A single-Contig portion of a path across the assembly graph. */
    public interface PathPart {
        Contig getContig(); // will be null for PathParts that depart from the graph (PathPartGap)
        CharSequence getSequence(); // will be null for PathParts on the graph (PathPartContig)
        void extend( final char call );
        int getStart();
        int getStop();
        boolean isGap();
        int getLength();
        PathPart rc();
        char getFirstCall();
        char getLastCall();
        default boolean startsAtBeginning() { return getStart() == 0; }
        default boolean stopsAtEnd() { return getStop() + Kmer.KSIZE - 1 == getContig().size(); }
    }

    /** A part of a path that isn't present in the graph. */
    public static final class PathPartGap implements PathPart {
        private final StringBuilder sequence = new StringBuilder();

        public PathPartGap( final KmerAdjacency kmer ) { sequence.append(kmer.toString()); }
        private PathPartGap( final CharSequence sequence ) { this.sequence.append(sequence); }

        @Override public Contig getContig() { return null; }
        @Override public CharSequence getSequence() { return sequence.toString(); }
        @Override public void extend( final char call ) { sequence.append(call); }
        @Override public int getStart() { return 0; }
        @Override public int getStop() { return sequence.length(); }
        @Override public boolean isGap() { return true; }
        @Override public int getLength() { return sequence.length() - Kmer.KSIZE + 1; }
        @Override public PathPart rc() { return new PathPartGap(new SequenceRC(sequence)); }
        @Override public char getFirstCall() { return sequence.charAt(Kmer.KSIZE - 1); }
        @Override public char getLastCall() {
            return sequence.charAt(sequence.length() - Kmer.KSIZE + 1);
        }
    }

    /** A part of a path that is present as a sub-sequence of some Contig. */
    public static final class PathPartContig implements PathPart {
        private final Contig contig;
        private final int start;
        private int stop;

        public PathPartContig( final Contig contig, final int start ) {
            this(contig, start, start+1);
        }
        public PathPartContig( final Contig contig, final int start, final int stop ) {
            this.contig = contig;
            this.start = start;
            this.stop = stop;
        }

        @Override public Contig getContig() { return contig; }
        @Override public String getSequence() { return null; }
        @Override public void extend( final char call ) { stop += 1; }
        @Override public int getStart() { return start; }
        @Override public int getStop() { return stop; }
        @Override public boolean isGap() { return false; }
        @Override public int getLength() { return stop - start; }
        @Override public PathPart rc() {
            final int revBase = contig.size() - Kmer.KSIZE + 1;
            return new PathPartContig(contig.rc(), revBase - stop, revBase - start);
        }
        @Override public char getFirstCall() {
            return getContig().getSequence().charAt(start + Kmer.KSIZE - 1);
        }
        @Override public char getLastCall() { return getContig().getSequence().charAt(stop - 1); }
    }

    /** A helper class for Path building.
     * It transforms an array of base calls into a list of contigs (and gaps).
     */
    public static final class PathBuilder {
        private final KmerSet kmerAdjacencySet;
        private final List parts;
        private long kVal = 0;
        private int count = 0;
        private PathPart currentPathPart = null;

        public PathBuilder( final KmerSet kmerAdjacencySet ) {
            this.kmerAdjacencySet = kmerAdjacencySet;
            this.parts = new ArrayList<>();
        }

        public List processCalls( final byte[] calls ) {
            parts.clear();
            kVal = 0;
            count = 0;
            currentPathPart = null;

            for ( int idx = 0; idx != calls.length; ++idx ) {
                processCall( (char)calls[idx] );
            }
            return new ArrayList<>(parts);
        }

        private void processCall( final char call ) {
            kVal <<= 2;
            switch ( call ) {
                case 'C': case 'c': kVal += 1; break;
                case 'G': case 'g': kVal += 2; break;
                case 'T': case 't': kVal += 3; break;
            }
            if ( ++count >= Kmer.KSIZE ) { // if we've seen enough calls to make a whole kmer
                processKval(call);
            }
        }

        private void processKval( final char call ) {
            final KmerAdjacency kmer = KmerAdjacencyImpl.find(kVal, kmerAdjacencySet);
            if ( kmer == null ) { // if the kmer isn't in the KmerSet
                processGap(call); // we'll make a "gap" PathPart
            } else {
                processKmer(call, kmer); // it's a part of some contig
            }
        }

        private void processGap( final char call ) {
            // if there's no current path part, or if the current part isn't a gap, create a PathPartGap
            if ( currentPathPart == null || !currentPathPart.isGap() ) {
                currentPathPart = new PathPartGap(new KmerAdjacencyImpl(kVal));
                parts.add(currentPathPart);
            } else {
                // the current path part is a PathPartGap, so just extend it
                currentPathPart.extend(call);
            }
        }

        private void processKmer( final char call, final KmerAdjacency kmer ) {
            final Contig contig = kmer.getContig();
            final int contigOffset = kmer.getContigOffset();
            if ( currentPathPart == null ) {
                // we've found a kmer, but don't have a current path part -- create one
                currentPathPart = new PathPartContig(contig, contigOffset);
                parts.add(currentPathPart);
            } else if ( contig == currentPathPart.getContig() ) {
                // our lookup is on the current path part's contig -- extend it
                extendCurrentPathPart(call, contig, contigOffset);
            } else {
                // we're moving from one contig to a new one, or from a gap onto a contig
                processNewContig(contig, contigOffset);
            }
        }

        private void extendCurrentPathPart( final char call,
                                            final Contig contig,
                                            final int contigOffset ) {
            final int curStop = currentPathPart.getStop();
            if ( contigOffset == curStop ) { // if this is just the next kmer on our current contig
                currentPathPart.extend(call);
            } else if ( contigOffset == 0 && contig.getNKmers() == curStop ) {
                // starting over on the same contig (i.e., a tight cycle)
                currentPathPart = new PathPartContig(contig, 0);
                parts.add(currentPathPart);
            } else {
                // kmer is non-contiguous on the current contig.
                // start a new path part after a zero-length gap.
                parts.add(zeroLengthGap(currentPathPart));
                currentPathPart = new PathPartContig(contig, contigOffset);
                parts.add(currentPathPart);
            }
        }

        private void processNewContig( final Contig contig, final int contigOffset ) {
            if ( currentPathPart.isGap() ) {
                // try to squash captured gaps caused by isolated, single-base sequencing errors
                if ( currentPathPart.getLength() == Kmer.KSIZE ) {
                    final int prevPartIdx = parts.size() - 2;
                    if ( prevPartIdx >= 0 ) {
                        if ( gapIsSquashed(prevPartIdx, contig, contigOffset) ) {
                            return;
                        }
                    }
                }
            } else if ( !currentPathPart.stopsAtEnd() || contigOffset != 0 ||
                    !currentPathPart.getContig().isSuccessor(contig) ) {
                // not an end-to-end join across connected contigs -- record a zero-length gap
                parts.add(zeroLengthGap(currentPathPart));
            }

            // we're jumping to a new contig.  start a new path part
            currentPathPart = new PathPartContig(contig, contigOffset);
            parts.add(currentPathPart);
        }

        private boolean gapIsSquashed( final int prevPartIdx,
                                       final Contig contig,
                                       final int contigOffset ) {
            final PathPart prevPart = parts.get(prevPartIdx);
            final Contig prevPartContig = prevPart.getContig();
            final int prevPartStart = prevPart.getStart();
            final int prevPartStop = prevPart.getStop();
            final int prevPartMaxStop =
                    prevPartContig.size() - Kmer.KSIZE + 1;
            final int newStop = contigOffset + 1;

            if ( prevPartContig == contig ) { // if the gap is internal to a single contig
                if ( contigOffset - prevPartStop == Kmer.KSIZE ) {
                    // smooth over a kmer-sized gap in a single contig by backfilling the gap
                    currentPathPart = new PathPartContig(prevPartContig, prevPartStart, newStop);
                    parts.set(prevPartIdx, currentPathPart);
                    parts.remove(prevPartIdx + 1);
                    return true;
                }
            } else if ( prevPartMaxStop - prevPartStop + contigOffset == Kmer.KSIZE ) {
                // kmer-size gap crosses from one contig to another:  backfill the gap
                parts.set(prevPartIdx,
                        new PathPartContig(prevPartContig, prevPartStart, prevPartMaxStop));
                currentPathPart = new PathPartContig(contig, 0, newStop);
                parts.set(prevPartIdx + 1, currentPathPart);
                return true;
            }
            return false;
        }

        private static PathPart zeroLengthGap( final PathPart currentPathPart ) {
            final int currentStop = currentPathPart.getStop();
            final CharSequence currentSequence = currentPathPart.getContig().getSequence();
            final CharSequence almostAKmer =
                    currentSequence.subSequence(currentStop, currentStop + Kmer.KSIZE - 1);
            return new PathPartGap(almostAKmer);
        }
    }

    /** A path through the assembly graph for something (probably a read). */
    public static final class Path {
        private final List parts;

        public Path( final byte[] calls, final PathBuilder pathBuilder ) {
            parts = pathBuilder.processCalls(calls);
        }

        // RCing constructor
        private Path( final Path that ) {
            final List thoseParts = that.parts;
            final int nParts = thoseParts.size();
            parts = new ArrayList<>(nParts);
            for ( int idx = nParts - 1; idx >= 0; --idx ) {
                parts.add(thoseParts.get(idx).rc());
            }
        }

        public List getParts() { return parts; }
        public Path rc() { return new Path(this); }

        @Override public String toString() {
            if ( parts.size() == 0 ) return "";
            final StringBuilder sb = new StringBuilder();
            String prefix = "";
            final PathPart firstPart = parts.get(0);
            final PathPart lastPart = parts.get(parts.size() - 1);
            for ( final PathPart pp : parts ) {
                sb.append(prefix);
                prefix = ", ";
                if ( pp.isGap() ) {
                    sb.append("NoKmer(").append(pp.getLength()).append(")");
                } else {
                    final Contig contig = pp.getContig();
                    sb.append(contig);
                    final int maxStop = contig.size() - Kmer.KSIZE + 1;
                    if ( (pp != firstPart && pp.getStart() != 0) ||
                         (pp != lastPart && pp.getStop() != maxStop) ) {
                        sb.append('(').append(pp.getStart()).append('-')
                                .append(pp.getStop()).append('/').append(maxStop).append(')');
                    }
                }
            }
            return sb.toString();
        }
    }

    public static final class WalkDataFactory {
        private int nextNum;

        public WalkData create() { return new WalkData(++nextNum); }
    }

    /** Per-Contig storage for depth-first graph walking. */
    public static final class WalkData {
        public int visitNum;
        public int minVisitNum;

        private WalkData( final int visitNum ) {
            this.visitNum = visitNum;
            this.minVisitNum = visitNum;
        }
    }

    /**
     * A count of the number of read Paths that cross through some Contig from some previous Contig
     * to some subsequent Contig.  This helps us with phasing, so that we limit ourselves to
     * graph traversals truly represented by the data.
     */
    public static final class TransitPairCount {
        private final Contig prevContig;
        private final Contig nextContig;
        private final TransitPairCount rc;
        private int count;

        public TransitPairCount( final Contig prevContig, final Contig nextContig ) {
            this.prevContig = prevContig;
            this.nextContig = nextContig;
            this.rc = new TransitPairCount(this);
            this.count = 1;
        }

        private TransitPairCount( final TransitPairCount rc ) {
            this.prevContig = rc.nextContig.rc();
            this.nextContig = rc.prevContig.rc();
            this.rc = rc;
            this.count = 1;
        }

        public Contig getPrevContig() { return prevContig; }
        public Contig getNextContig() { return nextContig; }
        public TransitPairCount getRC() { return rc; }
        public void observe() { count += 1; rc.count += 1; }
        public void resetCount() { count = 0; rc.count = 0; }
        public int getCount() { return count; }

        @Override public boolean equals( final Object obj ) {
            if ( !(obj instanceof TransitPairCount) ) return false;
            final TransitPairCount that = (TransitPairCount)obj;
            return this.prevContig == that.prevContig && this.nextContig == that.nextContig;
        }

        @Override public int hashCode() {
            return 47 * (47 * prevContig.hashCode() + nextContig.hashCode());
        }

        @Override public String toString() {
            return prevContig + "<-->" + nextContig + " " + count + "x";
        }
    }

    /**
     * A list of Contigs through the assembly graph.
     * Differs from a Path in that there are no gaps, and all the Contigs are properly adjacent
     * in the graph.
     * In an earlier phase of assembly, all valid phased Traversals are produced.
     * Later, the same Traversal class is used to hook up Traversals across which we cannot phase.
     * (These are somewhat improperly called Scaffolds.)
     */
    public static final class Traversal {
        private final List contigs;
        private final int minMaxObservations;
        private int hashCode; // Traversal is immutable, so we can stash the hashCode

        public Traversal( final Collection contigs ) {
            if ( contigs == null || contigs.isEmpty() ) {
                throw new GATKException("null or empty list of contigs in traversal");
            }
            this.contigs = new ArrayList<>(contigs);
            int minMaxObservations = Integer.MAX_VALUE;
            for ( Contig contig : contigs ) {
                minMaxObservations = Math.min(minMaxObservations, contig.getMaxObservations());
            }
            this.minMaxObservations = minMaxObservations;
            this.hashCode = 0;
        }

        // RC constructor
        private Traversal( final Traversal traversal ) {
            final List thoseContigs = traversal.contigs;
            this.contigs = thoseContigs instanceof ContigListRC ?
                            ((ContigListRC)thoseContigs).rc() : new ContigListRC(thoseContigs);
            this.minMaxObservations = traversal.minMaxObservations;
            this.hashCode = 0;
        }

        public List getContigs() { return Collections.unmodifiableList(contigs); }
        public Contig getFirstContig() { return contigs.get(0); }
        public Contig getLastContig() { return contigs.get(contigs.size() - 1); }
        public Traversal rc() { return new Traversal(this); }
        public boolean isRC() { return contigs instanceof ContigListRC; }
        public int getMinMaxObservations() { return minMaxObservations; }
        public boolean isInextensible() { return getLastContig().isCycleMember(); }

        @Override
        public String toString() {
            return contigs.stream()
                    .map(Contig::toString)
                    .collect(Collectors.joining("+"));
        }

        public int getSequenceLength() {
            int len = 0;
            for ( final Contig contig : contigs ) {
                len += contig.getNKmers();
            }
            return len + Kmer.KSIZE - 1;
        }

        public String getSequence() {
            if ( contigs.size() == 0 ) return "";
            final StringBuilder sb =
                    new StringBuilder(contigs.get(0).getSequence().subSequence(0, Kmer.KSIZE - 1));
            for ( final Contig contig : contigs ) {
                final CharSequence seq = contig.getSequence();
                sb.append(seq.subSequence(Kmer.KSIZE - 1, seq.length()));
            }
            return sb.toString();
        }

        @Override public int hashCode() {
            if ( hashCode == 0 ) {
                hashCode = contigs.hashCode();
            }
            return hashCode;
        }

        @Override public boolean equals( final Object obj ) {
            if ( this == obj ) return true;
            if ( !(obj instanceof Traversal) ) return false;
            final Traversal that = (Traversal)obj;
            if ( hashCode != that.hashCode || contigs.size() != that.contigs.size() ) return false;
            return contigs.equals(that.contigs);
        }

        public static Traversal combine( final Traversal trav1, final Traversal trav2 ) {
            return combineOverlappers(trav1, trav2, 1);
        }

        public static Traversal combineOverlappers( final Traversal trav1,
                                                    final Traversal trav2,
                                                    final int overlapLen ) {
            final int len1 = trav1.contigs.size();
            if ( !trav1.contigs.subList(len1 - overlapLen, len1)
                    .equals(trav2.contigs.subList(0, overlapLen)) ) {
                throw new GATKException("combining non-overlapping traversals");
            }
            final int len2 = trav2.contigs.size();
            final List combinedList =
                    new ArrayList<>(trav1.contigs.size() + len2 - overlapLen);
            combinedList.addAll(trav1.contigs);
            combinedList.addAll(trav2.contigs.subList(overlapLen, len2));
            return new Traversal(combinedList);
        }
    }

    /** Set of traversals.
     *  Rejects adding RC's of existing traversals.
     *  Explodes when it gets too big. */
    public static class TraversalSet extends HashSet {
        private static final long serialVersionUID = 1L;
        final int tooManyTraversals;

        public TraversalSet( final int tooManyTraversals ) {
            this.tooManyTraversals = tooManyTraversals;
        }

        @Override public boolean add( final Traversal traversal ) {
            if ( contains(traversal.rc()) ) {
                return false;
            }
            if ( size() >= tooManyTraversals ) {
                throw new AssemblyTooComplexException();
            }
            return super.add(traversal);
        }
    }

    /** Something to throw when we have too many Contigs or Traversals to proceed with assembly. */
    public final static class AssemblyTooComplexException extends RuntimeException {
        static final long serialVersionUID = -1L;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy