 
                        
        
                        
        org.broadinstitute.hellbender.tools.walkers.groundtruth.GroundTruthReadsBuilder Maven / Gradle / Ivy
 The newest version!
        
        package org.broadinstitute.hellbender.tools.walkers.groundtruth;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.Tuple;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.CigarBuilder;
import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper;
import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Random;
import java.util.zip.GZIPOutputStream;
/**
 * An internal tool to produce a flexible and robust ground truth set for base calling training.
 *
 *
 *  Input 
 * 
 *     -  Coordinate-sorted and indexed SAM/BAM/CRAM *
-  Maternal and Parental references (fa) *
-  Folder with address translation files from reference to maternal/parental references (filename example: maternal.chr9.csv)*
*
 * Output 
 * 
 *     -  CSV file containing maternal/parental haplotype scores and many more columns (.csv or .csv.gz supported) *
*
 * At present, the output will contain the following columns (not nessarily in this order) for each processed read:
 *
 *     - ReadName, ReadChrom, ReadStart, ReadEnd, tm, mapq, flags, ReadCigar, ReadSequence, ReadUnclippedStart,
 *     ReadUnclippedEnd - information directly extracted from the input read*
- PaternalHaplotypeInterval, BestHaplotypeSequence, PaternalHaplotypeScore - parental haplotype information.
 *     First the read interval is translated into parental space, the haplotype sequence extracted and then scored*
- Maternal* - same for maternal*
- RefHaplotypeScore - score computed from the reference haplotype*
- BestHaplotypeKey - flow based key for the 'best' (score wise) haplotype (out of the maternal/paternal) pair)*
- ConsensusHaplotypeKey - a flow based key constructed from the flow keys of the maternal and paternal
 *     haplotypes, containing only keys that agree (other keys filled with fixed/special value)*
*
 *
 *Usage examples
 * 
 * gatk GroundTruthReadsBuilder \
 *  -R
 *  ../../../ref/Homo_sapiens_assembly38.fasta
 *  -I
 *  150548-UGAv3-4.chr9.cram
 *  --maternal-ref
 *  chr9_HG001_maternal.fa
 *  --paternal-ref
 *  chr9_HG001_paternal.fa
 *  --ancestral-translators-base-path
 *  ./
 *  --output-csv
 *  output-small.csv
 *  --subsampling-ratio
 *  1.0
 *  --max-output-reads
 *  100000000
 *  --intervals
 *  chr9:109991494-109991494
 *  --smith-waterman
 *  FASTEST_AVAILABLE
 *  --likelihood-calculation-engine
 *  FlowBased
 *  -mbq
 *  0
 *  --kmer-size
 *  10
 *  --gt-debug
 *  --output-flow-length
 *  1000
 *  --haplotype-output-padding-size
 *  8
 *  --prepend-sequence
 *  TTTT
 *  --append-sequence
 *  CCCC
 * 
 *
 * {@GATK.walkertype ReadWalker}
 */
@CommandLineProgramProperties(
        summary = "Ground Truth Reads Builder",
        oneLineSummary = "Produces a flexible and robust ground truth set for base calling training",
        programGroup = FlowBasedProgramGroup.class
)
@DocumentedFeature
@ExperimentalFeature
public final class GroundTruthReadsBuilder extends PartialReadWalker {
    // constants
    private static final Logger logger = LogManager.getLogger(GroundTruthReadsBuilder.class);
    public static final int DEFAULT_FILL_VALUE = -65;
    public static final int NONREF_FILL_VALUE = -80;
    public static final int UNKNOWN_FILL_VALUE = -85;
    public static final int SOFTCLIP_FILL_VALUE = -83;
    private static final int EXTRA_FILL_FROM_HAPLOTYPE = 50;
    static final String C_MATERNAL = "maternal";
    static final String C_PATERNAL = "paternal";
    @Argument(fullName = "maternal-ref", doc="maternal reference file")
    public GATKPath maternalRefPath = null;
    @Argument(fullName = "paternal-ref", doc="paternal reference file")
    public GATKPath paternalRefPath = null;
    @Argument(fullName = "ancestral-translators-base-path", doc="base path for ancestral translation ancestral.contig.csv files")
    public GATKPath ancestralTranslatorsBasePath = null;
    @Argument(fullName = "subsampling-ratio", doc = "subsampling ratio, should be between 0 and 1", optional = true)
    public double   subsamplingRatio = 1.0;
    @Argument(fullName = "max-output-reads", doc = "maximal number of reads to output", optional = true)
    public int      maxOutputReads = 20000000;
    @Argument(fullName = "output-flow-length", doc = "Required length of output flows", optional = true)
    public int      outputFlowLength = 0;
    @Argument(fullName = "prepend-sequence", doc = "Sequence to prepend (barcode)", optional = true)
    public String   prependSequence;
    @Argument(fullName = "append-sequence", doc = "Sequence to append (adapter)", optional = true)
    public String   appendSequence;
    @Argument(fullName = "min-mq", doc = "Minimal mapping quality", optional = true)
    public double   minMappingQuality = 0;
    @Argument(fullName = "max-rq", doc = "Maximal read quality", optional = true)
    public double   maxReadQuality = 0;
    @Argument(fullName = "include-supp-align", doc = "Include supplementary alignments", optional = true)
    public boolean  includeSuppAlign = false;
    @Argument(fullName = "min-haplotype-score", doc = "Minimal score (likelihood) on either haplotype", optional = true)
    public double   minHaplotypeScore = 0;
    @Argument(fullName = "min-haplotype-score-delta", doc = "Minimal score (likelihood) delta between haplotypes", optional = true)
    public double   minHaplotypeScoreDelta = 0;
    @Argument(fullName = "haplotype-output-padding-size", doc = "Number of N to append to best haplotype on output", optional = true)
    public int      haplotypeOutputPaddingSize = 8;
    @Argument(fullName = "discard-non-polyt-softclipped-reads", doc = "Discard reads which are softclipped, unless the softclip is polyT, defaults to true", optional = true)
    public boolean  discardNonPolytSoftclippedReads = false;
    @Argument(fullName = "fill-trimmed-reads-Q", doc = "Reads with tm:Q should be filled from haplotype, otherwise (default) filled with -80", optional = true)
    public boolean fillTrimmedReadsQ;
    @Argument(fullName = "fill-trimmed-reads-Z", doc = "Reads with tm:Z should be filled from haplotype, otherwise (default) filled with -80", optional = true)
    public boolean fillTrimmedReadsZ;
    @Argument(fullName = "fill-trimmed-reads", doc = "Reads with tm:Q or tm:Z should be filled from haplotype, otherwise (default) filled with -80", optional = true)
    public boolean fillTrimmedReads;
    @Argument(fullName = "fill-softclipped-reads", doc = "Softclipped reads should be filled from haplotype, otherwise (default) filled with -83", optional = true)
    public boolean fillSoftclippedReads;
    @Argument(fullName = "false-snp-compensation", doc = "skip haplotype bases until same base as read starts (false SNP compensation)", optional = true)
    public boolean falseSnpCompensation;
    @Argument(fullName = "output-csv", doc="main CSV output file. the file containing maternal/parental "
            + "maternal and paternal haplotype sequences and scores (and many more columns). supported file extensions: .csv, .csv.gz.")
    public GATKPath outputCsvPath = null;
    @Hidden
    @Argument(fullName = "gt-debug", doc = "Turn additional internal logging on", optional = true)
    public boolean      debugMode = false;
    @Argument(fullName = "gt-no-output", doc = "do not generate output records", optional = true)
    public boolean      noOutput = false;
    @ArgumentCollection
    public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
    @ArgumentCollection
    public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
    // locals
    private final Random                        random = new Random();
    private int                                 outputReadsCount = 0;
    private ReferenceDataSource                 maternalReference;
    private ReferenceDataSource                 paternalReference;
    private AncestralContigLocationTranslator   locationTranslator;
    private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine;
    private PrintWriter                         outputCsv;
    private int                                 locationTranslationErrors;
    // static/const
    static final private String[]       CSV_FIELD_ORDER = {
            "ReadName", "ReadChrom", "ReadStart", "ReadEnd",
            "PaternalHaplotypeScore", "MaternalHaplotypeScore", "RefHaplotypeScore",
            "ReadKey", "BestHaplotypeKey", "ConsensusHaplotypeKey",
            "tm", "mapq", "flags", "ReadCigar",
            "ReadSequence", "PaternalHaplotypeSequence", "MaternalHaplotypeSequence", "BestHaplotypeSequence",
            "ReadUnclippedStart", "ReadUnclippedEnd", "PaternalHaplotypeInterval", "MaternalHaplotypeInterval"
    };
    private static class ScoredHaplotype {
        ReferenceContext        ref;
        ReferenceContext        clippedRef;
        ReferenceContext        unclippedRef;
        int                     softclipFrontFillCount;
        Haplotype               haplotype;
        double                  score;
    }
    @Override
    public void onTraversalStart() {
        super.onTraversalStart();
        // initialize references
        maternalReference = ReferenceDataSource.of(maternalRefPath.toPath());
        paternalReference = ReferenceDataSource.of(paternalRefPath.toPath());
        locationTranslator = new AncestralContigLocationTranslator(ancestralTranslatorsBasePath);
        // create likelihood engine
        ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false);
        if ( engine instanceof FlowBasedAlignmentLikelihoodEngine) {
            likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine;
        } else {
            throw new GATKException("must use a flow based likelihood calculation engine");
        }
        // open output, write header
        try {
            if (outputCsvPath.toPath().toString().endsWith(".gz")) {
                outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream()));
            } else {
                outputCsv = new PrintWriter(outputCsvPath.getOutputStream());
            }
        } catch (IOException e) {
            throw new GATKException("failed to open csv output: " + outputCsvPath, e);
        }
        emitCsvHeaders();
    }
    @Override
    public void closeTool() {
        if ( locationTranslationErrors != 0 ) {
            logger.warn("" + locationTranslationErrors + " location translation errors detected");
        }
        outputCsv.close();
        super.closeTool();
    }
    @Override
    protected boolean shouldExitEarly(GATKRead read) {
        // limit number of output reads
        return ( maxOutputReads != 0) && (outputReadsCount >= maxOutputReads);
    }
    @Override
    public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
        // filter out due to mapping quality
        if ( minMappingQuality != 0 && read.getMappingQuality() < minMappingQuality ) {
            return;
        }
        // supplemental alignment filter
        if ( read.isSupplementaryAlignment() && !includeSuppAlign ) {
            return;
        }
        // discard because softclipped
        if ( discardNonPolytSoftclippedReads && isEndSoftclipped(read) && !isEndPolyTSoftclipped(read) ) {
            return;
        }
        // subsample
        // NOTE: this is done BEFORE read quality and haplotype scoring
        if ( random.nextDouble() > subsamplingRatio ) {
            return;
        }
        // filter out due to read quality
        FlowBasedRead                   flowRead = null;
        if ( maxReadQuality != 0 ) {
            flowRead = buildFlowRead(read);
            if (getFlowBasedReadQuality(flowRead, flowRead.getMaxHmer()) > maxReadQuality) {
                return;
            }
        }
        // process the read
        try {
            // make sure we have a flow read
            if ( flowRead == null ) {
                flowRead = buildFlowRead(read);
            }
            // prepare
            final ScoredHaplotype                   maternal = new ScoredHaplotype();
            final ScoredHaplotype                   paternal = new ScoredHaplotype();
            // translate location to ascentors
            final Tuple  ancestralLocs = locationTranslator.translate(read);
            maternal.ref = new ReferenceContext(maternalReference, ancestralLocs.a);
            paternal.ref = new ReferenceContext(paternalReference, ancestralLocs.b);
            // build haplotypes
            maternal.haplotype = buildReferenceHaplotype(maternal.ref, read);
            paternal.haplotype = buildReferenceHaplotype(paternal.ref, read);
            buildExtendedRef(maternal, maternalReference, ancestralLocs.a, read);
            buildExtendedRef(paternal, paternalReference, ancestralLocs.b, read);
            // generate score for reference
            final double        refScore = scoreReadAgainstReference(read, referenceContext);
            // score read against haplotypes, create flow versions of read nad haplotype
            if ( areSame(maternal.haplotype, referenceContext, read.isReverseStrand()) ) {
                maternal.score = refScore;
            } else {
                maternal.score = scoreReadAgainstHaplotype(read, maternal);
            }
            if ( areSame(paternal.haplotype, referenceContext, read.isReverseStrand()) ) {
                paternal.score = refScore;
            } else if ( arsSame(maternal.haplotype, paternal.haplotype, read.isReverseStrand()) ) {
                paternal.score = scoreReadAgainstHaplotype(read, paternal);
            } else {
                paternal.score = scoreReadAgainstHaplotype(read, paternal);
            }
            // debug printing (in INFO for now, will be changed to DEBUG)
            debugLog(read, referenceContext, maternal, paternal);
            // filter on min score
            // TODO: this is probaby wrong since the scores are negative. To be handled later
            if ( minHaplotypeScore != 0 && Math.min(maternal.score, paternal.score) > minHaplotypeScore ) {
                return;
            }
            // filter on score delta
            if ( minHaplotypeScoreDelta != 0 && Math.abs(maternal.score - paternal.score) > minHaplotypeScoreDelta ) {
                return;
            }
            // if here, emit this read
            outputReadsCount++;
            emit(read, flowRead, refScore, maternal, paternal);
        } catch (LocationTranslationException e) {
            logger.warn("location translation exception: " + e.getMessage());
            locationTranslationErrors++;
        } catch (IOException e) {
            throw new GATKException("failed to process read: " + read.getName(), e);
        }
    }
    private boolean shouldFillFromHaplotype(final GATKRead read) {
        // softclip has priori
        if ( isEndSoftclipped(read) )
            return fillSoftclippedReads;
        // extending timmed as well?
        final String    tm = read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME);
        if ( tm == null ) {
            return true;
        } else {
            boolean             hasA = tm.indexOf('A') >= 0;
            boolean             hasQ = tm.indexOf('Q') >= 0;
            boolean             hasZ = tm.indexOf('Z') >= 0;
            if ( hasA ) {
                return false;
            }
            else if ( hasZ && (fillTrimmedReads || fillTrimmedReadsZ) ) {
                return true;
            }
            else if ( hasQ && (fillTrimmedReads || fillTrimmedReadsQ) ) {
                return true;
            } else {
                return false;
            }
        }
    }
    private void buildExtendedRef(final ScoredHaplotype scoredHaplotype, final ReferenceDataSource ref, final SimpleInterval loc, final GATKRead read) {
        // assume no extension
        int     extendStart = 0;
        int     extendEnd = 0;
        // calc soft extension
        if ( fillSoftclippedReads ) {
            final CigarElement elem = !read.isReverseStrand()
                    ? read.getCigar().getLastCigarElement() : read.getCigar().getFirstCigarElement();
            if (elem.getOperator() == CigarOperator.S) {
                if (!read.isReverseStrand()) {
                    extendEnd += elem.getLength();
                } else {
                    extendStart += elem.getLength();
                }
            }
        }
        // add padding
        if ( !read.isReverseStrand() ) {
            extendEnd += haplotypeOutputPaddingSize;
        } else {
            extendStart += haplotypeOutputPaddingSize;
        }
        // add extra fill from haplotype
        if ( (outputFlowLength != 0) && shouldFillFromHaplotype(read) ) {
            int     length = (loc.getEnd() + extendEnd) - (loc.getStart() - extendStart);
            int     delta = Math.max(0, outputFlowLength - length) + EXTRA_FILL_FROM_HAPLOTYPE;
            if ( !read.isReverseStrand() ) {
                extendEnd += delta;
            } else {
                extendStart += delta;
            }
        }
        // compensate for skipage of first hmer
        final int delta = scoredHaplotype.ref.getInterval().size() - scoredHaplotype.haplotype.getGenomeLocation().getLengthOnReference();
        if ( delta != 0 ) {
            if ( !read.isReverseStrand() ) {
                extendStart -= delta;
            } else {
                extendEnd -= delta;
            }
        }
        int         minStart = ref.getSequenceDictionary().getSequence(loc.getContig()).getStart();
        int         maxEnd = ref.getSequenceDictionary().getSequence(loc.getContig()).getEnd();
        if ( isStartSoftclipped(read) ) {
            scoredHaplotype.clippedRef = new ReferenceContext(ref,
                    new SimpleInterval(loc.getContig(), Math.max(minStart, loc.getStart() - extendStart), Math.min(maxEnd, loc.getEnd() + extendEnd)));
        }
        // add front unclipped
        {
            final CigarElement frontElem = !read.isReverseStrand()
                    ? read.getCigar().getFirstCigarElement() : read.getCigar().getLastCigarElement();
            if (frontElem.getOperator() == CigarOperator.S) {
                if (!read.isReverseStrand()) {
                    extendStart += frontElem.getLength();
                } else {
                    extendEnd += frontElem.getLength();
                }
            }
            scoredHaplotype.unclippedRef = new ReferenceContext(ref,
                    new SimpleInterval(loc.getContig(), Math.max(minStart, loc.getStart() - extendStart), Math.min(maxEnd, loc.getEnd() + extendEnd)));
        }
    }
    private boolean arsSame(final Haplotype h1, final Haplotype h2, boolean isReversed) {
        return Arrays.equals(h1.getBases(), h2.getBases());
    }
    private boolean areSame(final Haplotype h, final ReferenceContext ref, boolean isReversed) {
        return Arrays.equals(h.getBases(), reverseComplement(ref.getBases(), isReversed));
    }
    private FlowBasedRead buildFlowRead(final GATKRead read) {
        FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
        return new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
    }
    private boolean isEndSoftclipped(final GATKRead read) {
        if ( !read.isReverseStrand() ) {
            return read.getCigar().getLastCigarElement().getOperator() == CigarOperator.S;
        } else {
            return read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.S;
        }
    }
    private boolean isStartSoftclipped(final GATKRead read) {
        if ( !read.isReverseStrand() ) {
            return read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.S;
        } else {
            return read.getCigar().getLastCigarElement().getOperator() == CigarOperator.S;
        }
    }
    private boolean isEndPolyTSoftclipped(final GATKRead read) {
        // must be softclipped
        if ( !isEndSoftclipped(read) )
            return false;
        // are all softclipped bases T
        final byte[]      bases = read.getBasesNoCopy();
        if ( !read.isReverseStrand() ) {
            final int      length = read.getCigar().getFirstCigarElement().getLength();
            for ( int n = 0 ; n < length ; n++ ) {
                if (bases[n] != 'T') {
                    return false;
                }
            }
        } else {
            final int      length = read.getCigar().getLastCigarElement().getLength();
            for ( int n = 0 ; n < length ; n++ ) {
                if (bases[bases.length - n - 1] != 'A') {
                    return false;
                }
            }
        }
        // if here all softclipped bases are T
        return true;
    }
    private void debugLog(final GATKRead read, final ReferenceContext referenceContext, final ScoredHaplotype maternal, final ScoredHaplotype paternal) {
        if ( debugMode ) {
            logger.info("read: " + read.getName() + " " + read.getCigar() + " " + read.getFlags());
            logger.info("read:          " + new SimpleInterval(read) + " " + new String(read.getBases()));
            logger.info("ref:           " + new SimpleInterval(referenceContext) + " " + new String(referenceContext.getBases()));
            logger.info("mRef: " + maternal.ref.getInterval() + " " + new String(maternal.ref.getBases()));
            logger.info("pRef: " + paternal.ref.getInterval() + " " + new String(paternal.ref.getBases()));
            logger.info("pmDiff:                                 " + new String(debugBinDiff(maternal.ref.getBases(), paternal.ref.getBases())));
            logger.info("mHap: " + maternal.score + " " + maternal.haplotype);
            logger.info("pHap: " + paternal.score + " " + paternal.haplotype);
        }
    }
    private byte[] debugBinDiff(final byte[] b1, final byte[] b2) {
        final int     len = Math.min(b1.length, b2.length);
        final byte[]  result = new byte[len];
        for ( int n = 0 ; n < len ; n++ ) {
            result[n] = (b1[n] == b2[n]) ? (byte)'_' : (byte)'1';
        }
        return result;
    }
    private double getFlowBasedReadQuality(final FlowBasedRead read, final int maxClass) {
        double      sum = 0;
        for ( int n = 0 ; n < read.getKeyLength() ; n++ ) {
            sum += read.getProb(n, maxClass);
        }
        return sum;
    }
    private Haplotype buildReferenceHaplotype(final ReferenceContext ref, final GATKRead read) {
        Locatable loc = new SimpleInterval(ref.getInterval());
        byte[]          haplotypeBases = reverseComplement(ref.getBases(), read.isReverseStrand());
        final byte[]    readBases = reverseComplement(getSoftclippedBases(read), read.isReverseStrand());
        // skip haplotype bases until same base as read starts (false SNP compensation)
        if ( falseSnpCompensation && (haplotypeBases[0] != readBases[0]) ) {
            final int       skip = detectFalseSNP(haplotypeBases, readBases);
            if ( skip != 0 ) {
                haplotypeBases = Arrays.copyOfRange(haplotypeBases, skip, haplotypeBases.length);
                if ( !read.isReverseStrand() ) {
                    loc = new SimpleInterval(loc.getContig(), loc.getStart() + skip, loc.getEnd());
                } else {
                    loc = new SimpleInterval(loc.getContig(), loc.getStart(), loc.getEnd() - skip);
                }
            }
        }
        final Haplotype haplotype = new Haplotype(haplotypeBases, loc);
        haplotype.setCigar(new CigarBuilder(false)
                .add(new CigarElement(haplotype.length(), CigarOperator.M)).make());
        return haplotype;
    }
    private byte[] getSoftclippedBases(final GATKRead read) {
        final int   startClip = (read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.SOFT_CLIP)
                                    ? read.getCigar().getFirstCigarElement().getLength() : 0;
        final int   endClip = (read.getCigar().getLastCigarElement().getOperator() == CigarOperator.SOFT_CLIP)
                ? read.getCigar().getLastCigarElement().getLength() : 0;
        final byte[] bases = read.getBasesNoCopy();
        if ( startClip == 0 && endClip == 0 ) {
            return bases;
        } else {
            return Arrays.copyOfRange(bases, startClip, bases.length - endClip);
        }
    }
    private int detectFalseSNP(final byte[] haplotypeBases, final byte[] readBases) {
        // parametrisation
        final int       maxSkipageSize = 5;     // will not skip more than this
        final int       requiredRemainingMatchSize = 5; // after skipage, this number of bases must match
        // this might be redundant, but just in case
        if ( haplotypeBases[0] == readBases[0] )
            return 0;
        // establish sizes of homopolymer on read
        int             readHomoSize = 0;
        for ( ; readHomoSize < readBases.length ; readHomoSize++ )
            if ( readBases[readHomoSize] != readBases[0] )
                break;
        // loop on possible skipage
        for ( int skip = 1 ; skip <= maxSkipageSize ; skip++ ) {
            // there must be enough bases
            if ( skip + requiredRemainingMatchSize > haplotypeBases.length ) {
                break;
            }
            if ( Math.max(skip + 1, requiredRemainingMatchSize) > readBases.length ) {
                break;
            }
            // skipage + 1 must be inside homo polymer
            if ( skip + 1 > readHomoSize ) {
                break;
            }
            // remaining bases must match
            if ( arrayRangeEquals(readBases, skip, requiredRemainingMatchSize, haplotypeBases, skip) ) {
                // found skipape point
                return skip;
            }
        }
        // if here, did not find skipage point
        return 0;
    }
    private boolean arrayRangeEquals(final byte[] a1, final int ofs1, final int len, final byte[] a2, final int ofs2) {
        for ( int i = 0 ; i < len ; i++ ) {
            if (a1[ofs1 + i] != a2[ofs2 + i]) {
                return false;
            }
        }
        return true;
    }
    private double scoreReadAgainstHaplotype(final GATKRead read, final ScoredHaplotype sh) {
        // build haplotypes
        final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
        final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(sh.haplotype, rgInfo.flowOrder);
        // create flow read
        final FlowBasedRead   flowRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
        if ( read.isReverseStrand() ) {
            flowRead.setDirection(FlowBasedRead.Direction.SYNTHESIS);
            flowRead.applyAlignment();
        }
        if ( !flowRead.isValid() ) {
            return -1;
        }
        // compute alternative score
        final int         hapKeyLength = flowHaplotype.getKeyLength();
        final double      score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
        return score;
    }
    private double scoreReadAgainstReference(final GATKRead read, final ReferenceContext ref) {
        // build haplotypes
        final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
        final FlowBasedHaplotype      flowHaplotype = new FlowBasedHaplotype(buildReferenceHaplotype(ref, read), rgInfo.flowOrder);
        // create flow read
        final FlowBasedRead           flowRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
        if ( read.isReverseStrand() ) {
            flowRead.setDirection(FlowBasedRead.Direction.SYNTHESIS);
            flowRead.applyAlignment();
        }
        if ( !flowRead.isValid() ) {
            return -1;
        }
        // compute alternative score
        final int         hapKeyLength = flowHaplotype.getKeyLength();
        final double      score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
        // debug
        if ( debugMode ) {
            logger.info("flowRead: " + flowRead);
            logger.info("flowHaplotype: " + flowHaplotype);
            logger.info("flowRead.key:      " + Arrays.toString(flowRead.getKey()));
            logger.info("flowHaplotype.key: " + Arrays.toString(flowHaplotype.getKey()));
            logger.info("scoreReadAgainstReference: score: " + score);
        }
        return score;
    }
    private int[] buildHaplotypeKey(final String haplotypeSeq, final FlowBasedReadUtils.ReadGroupInfo rgInfo, final boolean isReversed) {
        // create a haplotype to contain the sequence
        final byte[]              seq = reverseComplement(haplotypeSeq.getBytes(), isReversed);
        final Haplotype           h = new Haplotype(seq);
        final FlowBasedHaplotype  flowHaplotype = new FlowBasedHaplotype(h, !isReversed ? rgInfo.flowOrder : rgInfo.getReversedFlowOrder());
        // need to start on a T - find out T offset on the flow order
        int[]                     hapKey = flowHaplotype.getKey();
        byte[]                    hapFlowOrder = flowHaplotype.getFlowOrderArray();
        int                       appendZeroCount = 0;
        while ( hapKey[0] == 0 ) {
            hapKey = Arrays.copyOfRange(hapKey, 1, hapKey.length);
        }
        if ( (seq[0] != 'T') && (seq[0] != 'N') ) {
            int     ofs = 0;
            while ( hapFlowOrder[ofs] != 'T' )
                ofs++;
            while ( hapFlowOrder[ofs] != seq[0] ) {
                appendZeroCount++;
                ofs = (ofs + 1) % hapFlowOrder.length;
            }
        }
        if ( appendZeroCount == 0 ) {
            return hapKey;
        } else {
            int[]       hapKey1 = new int[appendZeroCount + hapKey.length];
            System.arraycopy(hapKey, 0, hapKey1, appendZeroCount, hapKey.length);
            return hapKey1;
        }
    }
    private int[] buildHaplotypeKeyForOutput(ScoredHaplotype scoredHaplotype, final FlowBasedReadUtils.ReadGroupInfo rgInfo, final int fillValue, final GATKRead read) {
        boolean                   isReversed = read.isReverseStrand();
        // create key from filled and unclipped version
        int[]                     hapKey = buildHaplotypeKey(new String(scoredHaplotype.unclippedRef.getBases()), rgInfo, isReversed);
        if ( isStartSoftclipped(read) ) {
            int[] hapKeyClipped = buildHaplotypeKey(new String(scoredHaplotype.clippedRef.getBases()), rgInfo, isReversed);
            scoredHaplotype.softclipFrontFillCount = hapKey.length - hapKeyClipped.length;
        } else {
            scoredHaplotype.softclipFrontFillCount = 0;
        }
        // prepare key
        final int flowLength = (outputFlowLength != 0) ? outputFlowLength : hapKey.length;
        final int[] key = new int[flowLength];
        int ofs;
        System.arraycopy(hapKey, 0, key, 0, ofs = Math.min(flowLength, hapKey.length));
        // adjust to a fixed length
        for (  ; ofs < flowLength ; ofs++ ) {
            key[ofs] = fillValue;
        }
        return key;
    }
    private String buildHaplotypeSequenceForOutput(final ScoredHaplotype haplotype, final boolean isReversed, final int keyBaseCount) {
        final StringBuilder      sb = new StringBuilder();
        if ( prependSequence != null ) {
            sb.append(prependSequence);
        }
        final String            seq = new String(reverseComplement(haplotype.unclippedRef.getBases(), isReversed));
        final String            baseCountSeq = seq.substring(0, keyBaseCount);
        sb.append(baseCountSeq);
        if ( appendSequence != null ) {
            sb.append(appendSequence);
        }
        return sb.toString();
    }
    private int[] buildConsensusKey(final int[] k1, final int[] k2) {
        final int         len = Math.min(k1.length, k2.length);
        final int[]       key = new int[len];
        for ( int n = 0 ; n < len ; n++ ) {
            key[n] = (k1[n] == k2[n]) ? k1[n] : -72;
        }
        return key;
    }
    private String flowKeyAsCsvString(final int[] key) {
        return "\"" + Arrays.toString(key).replaceAll("\\[|\\]|\\s", "") + "\"";
    }
    private String flowKeyAsCsvString(int[] key, final String seq, final String flowOrder) {
        final StringBuilder     sb = new StringBuilder();
        sb.append("\"");
        while ( key[0] == 0 ) {
            key = Arrays.copyOfRange(key, 1, key.length);
        }
        if ( (seq.charAt(0) != 'T') && (seq.charAt(0) != 'N') ) {
            int     ofs = 0;
            while ( flowOrder.charAt(ofs) != 'T' )
                ofs++;
            while ( flowOrder.charAt(ofs) != seq.charAt(0) ) {
                sb.append("0,");
                ofs = (ofs + 1) % flowOrder.length();
            }
        }
        sb.append(Arrays.toString(key).replaceAll("\\[|\\]|\\s", ""));
        sb.append("\"");
        return sb.toString();
    }
    private void emitCsvHeaders() {
        outputCsv.println(StringUtils.join(CSV_FIELD_ORDER, ","));
    }
    private void emit(final GATKRead read, final FlowBasedRead flowRead, final double refScore, final ScoredHaplotype maternal, final ScoredHaplotype paternal) throws IOException {
        // build line columns
        final Map        cols = new LinkedHashMap<>();
        // establish fill value
        final String        tm = read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME);
        boolean             hasA = (tm != null) && tm.indexOf('A') >= 0;
        boolean             hasQ = (tm != null) && tm.indexOf('Q') >= 0;
        boolean             hasZ = (tm != null) && tm.indexOf('Z') >= 0;
        int                 fillValue;
        if ( isEndSoftclipped(read) )
            fillValue = SOFTCLIP_FILL_VALUE;
        else if ( hasQ || hasZ ) {
            fillValue = hasA ? UNKNOWN_FILL_VALUE : NONREF_FILL_VALUE;
        } else {
            fillValue = DEFAULT_FILL_VALUE;
        }
        // read name
        cols.put("ReadName", read.getName());
        // haplotypes and reference scores
        cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score));
        cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score));
        cols.put("RefHaplotypeScore", String.format("%.6f", refScore));
        // build haplotype keys
        final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
        final int[]           paternalHaplotypeKey = buildHaplotypeKeyForOutput(paternal, rgInfo,fillValue, read);
        final int[]           maternalHaplotypeKey = buildHaplotypeKeyForOutput(maternal, rgInfo,fillValue, read);
        // build haplotype sequence
        final String           paternalHaplotypeSeq = buildHaplotypeSequenceForOutput(paternal, read.isReverseStrand(), keyBases(paternalHaplotypeKey));
        final String           maternalHaplotypeSeq = buildHaplotypeSequenceForOutput(maternal, read.isReverseStrand(), keyBases(maternalHaplotypeKey));
        // fill softclip at front
        softclipFill(paternal, paternalHaplotypeKey);
        softclipFill(maternal, maternalHaplotypeKey);
        // select best and establish consensus
        final boolean          ancestralHaplotypesSame = paternalHaplotypeSeq.equals(maternalHaplotypeSeq);
        final ScoredHaplotype  bestHaplotype = (paternal.score > maternal.score) ? paternal: maternal;
        final int[]            bestHaplotypeKey = (bestHaplotype == paternal) ? paternalHaplotypeKey : maternalHaplotypeKey;
        final int[]            consensus = buildConsensusKey(paternalHaplotypeKey, maternalHaplotypeKey);
        // emit best haplotype
        cols.put("BestHaplotypeSequence", (bestHaplotype == paternal) ? paternalHaplotypeSeq : maternalHaplotypeSeq);
        if ( !ancestralHaplotypesSame )
            cols.put("BestHaplotypeKey", flowKeyAsCsvString(bestHaplotypeKey));
        else
            cols.put("BestHaplotypeKey", flowKeyAsCsvString(consensus));
        // write consensus haplotype
        cols.put("ConsensusHaplotypeKey", flowKeyAsCsvString(consensus));
        // additional  fields
        cols.put("ReadChrom", read.getContig());
        cols.put("ReadStart", read.getStart());
        cols.put("ReadEnd", read.getEnd());
        cols.put("ReadUnclippedStart", read.getUnclippedStart());
        cols.put("ReadUnclippedEnd", read.getUnclippedEnd());
        cols.put("ReadCigar", read.getCigar());
        final String    readSeq = reverseComplement(read.getBasesString(), read.isReverseStrand());
        final int[]     readKey = !read.isReverseStrand() ? flowRead.getKey() : reversedCopy(flowRead.getKey());
        final String    readFlowOrder = reverseComplement(FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read).flowOrder, read.isReverseStrand());
        cols.put("ReadSequence", readSeq);
        cols.put("ReadKey", flowKeyAsCsvString(readKey, readSeq, readFlowOrder));
        cols.put("PaternalHaplotypeInterval", paternal.ref.getInterval());
        cols.put("PaternalHaplotypeSequence", paternalHaplotypeSeq);
        cols.put("MaternalHaplotypeInterval", maternal.ref.getInterval());
        cols.put("MaternalHaplotypeSequence", maternalHaplotypeSeq);
        cols.put("tm", (read.hasAttribute(FlowBasedRead.CLIPPING_TAG_NAME) ? read.getAttributeAsString(FlowBasedRead.CLIPPING_TAG_NAME) : ""));
        cols.put("mapq", read.getMappingQuality());
        cols.put("flags", read.getFlags());
        // construct line
        StringBuilder       sb = new StringBuilder();
        int                 colIndex = 0;
        for ( String field : CSV_FIELD_ORDER ) {
            if ( colIndex++ > 0 ) {
                sb.append(',');
            }
            if ( !cols.containsKey(field) ) {
                throw new GATKException("column missing from csv line: " + field);
            }
            sb.append(cols.get(field));
            cols.remove(field);
        }
        if ( cols.size() > 0 ) {
            throw new GATKException("invalid columns on csv line: " + cols.keySet());
        }
        // output line
        if ( !noOutput ) {
            outputCsv.println(sb);
        }
    }
    private void softclipFill(ScoredHaplotype scoredHaplotype, int[] key) {
        if ( !fillSoftclippedReads ) {
            int     limit = Math.min(scoredHaplotype.softclipFrontFillCount, key.length);
            for (int n = 0; n < limit ; n++) {
                key[n] = SOFTCLIP_FILL_VALUE;
            }
        }
    }
    private int keyBases(int[] key) {
        int     count = 0;
        for ( int c : key ) {
            if (c > 0) {
                count += c;
            }
        }
        return count;
    }
    private byte[] reverseComplement(final byte[] bases) {
        final byte[] result = new byte[bases.length];
        System.arraycopy(bases, 0, result, 0, result.length);
        SequenceUtil.reverseComplement(result);
        return result;
    }
    private byte[] reverseComplement(final byte[] bases, final boolean isReversed) {
        return !isReversed ? bases : reverseComplement(bases);
    }
    private String reverseComplement(final String bases) {
        return new String(reverseComplement(bases.getBytes()));
    }
    private String reverseComplement(final String bases, final boolean isReversed) {
        return !isReversed ? bases : reverseComplement(bases);
    }
    private int[] reversedCopy(final int[] bytes) {
        int[] copy = ArrayUtils.clone(bytes);
        ArrayUtils.reverse(copy);
        return copy;
    }
}
  © 2015 - 2025 Weber Informatics LLC | Privacy Policy