All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.illumina.parser.BclParser Maven / Gradle / Ivy

Go to download

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2012 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.illumina.parser;


import htsjdk.samtools.util.CloseableIterator;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.illumina.parser.readers.BclReader;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import static htsjdk.samtools.util.CollectionUtil.makeSet;

/**
 * BclParser parses a number of BclFiles equal to the total of all the values in outputLengths and returns a BclData object
 * segmented based on these lengths.  The only client of this class should be IlluminaDataProvider and an test classes.  See BclReader for
 * more information on BclFiles.  BclParser provides support for reading BaseCalls and QualityScores.
 */
class BclParser extends PerTileCycleParser {
    private static final int EAMSS_M2_GE_THRESHOLD = 30;
    private static final int EAMSS_S1_LT_THRESHOLD = 15; //was 15
    public static final byte MASKING_QUALITY = (byte) 0x02;

    private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(makeSet(IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores));

    protected final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;
    private final boolean applyEamssFilter;

    public BclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles, final OutputMapping outputMapping, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy) {
        this(directory, lane, tilesToCycleFiles, outputMapping, true, bclQualityEvaluationStrategy);
        this.initialize();
    }

    public BclParser(final File directory, final int lane, final CycleIlluminaFileMap tilesToCycleFiles, final OutputMapping outputMapping, final boolean applyEamssFilter, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy) {
        super(directory, lane, tilesToCycleFiles, outputMapping);
        this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
        this.applyEamssFilter = applyEamssFilter;
        this.initialize();
    }

    /**
     * Create a Bcl parser for an individual cycle and wrap it with the CycleFilesParser interface which populates
     * the correct cycle in BclData.
     *
     * @param files The files to parse.
     * @return A CycleFilesParser that populates a BclData object with data for a single cycle
     */
    @Override
    protected CycleFilesParser makeCycleFileParser(final List files) {
        return new BclDataCycleFileParser(files);
    }

    @Override
    public void initialize() {
        seekToTile(currentTile);
    }

    @Override
    public Set supportedTypes() {
        return SUPPORTED_TYPES;
    }

    @Override
    public BclData next() {
        final BclData bclData = super.next();

        final byte[][] bases = bclData.getBases();
        final byte[][] qualities = bclData.getQualities();

        //first run EAMSS
        if (this.applyEamssFilter) {
            for (int i = 0; i < bases.length; i++) {
                runEamssForReadInPlace(bases[i], qualities[i]);
            }
        }

        return bclData;
    }

    /**
     * EAMSS is an Illumina Developed Algorithm for detecting reads whose quality has deteriorated towards
     * their end and revising the quality to the masking quality (2) if this is the case.  This algorithm
     * works as follows (with one exception):
     * 

* Start at the end (high indices, at the right below) of the read and calculate an EAMSS tally at each * location as follow: * if(quality[i] < 15) tally += 1 * if(quality[i] >= 15 and < 30) tally = tally * if(quality[i] >= 30) tally -= 2 *

*

* For each location, keep track of this tally (e.g.) * Read Starts at <- this end * Cycle: 1 2 3 4 5 6 7 8 9 * Bases: A C T G G G T C A * Qualities: 32 32 16 15 8 10 32 2 2 * Cycle Score: -2 -2 0 0 1 1 -2 1 1 //The EAMSS Score determined for this cycle alone * EAMSS TALLY: 0 0 2 2 2 1 0 2 1 * X - Earliest instance of Max-Score *

* You must keep track of the maximum EAMSS tally (in this case 2) and the earliest(lowest) cycle at which * it occurs. If and only if, the max EAMSS tally >= 1 then from there until the end(highest cycle) of the * read reassign these qualities as 2 (the masking quality). The output qualities would therefore be * transformed from: *

* Original Qualities: 32 32 16 15 8 10 32 2 2 to * Final Qualities: 32 32 2 2 2 2 2 2 2 * X - Earliest instance of max-tally/end of masking *

* IMPORTANT: * The one exception is: If the max EAMSS Tally is preceded by a long string of G basecalls (10 or more, with a single basecall exception per10 bases) * then the masking continues to the beginning of that string of G's. E.g.: *

* Cycle: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 * Bases: C T A C A G A G G G G G G G G C A T * Qualities: 30 22 26 27 28 30 7 34 20 19 38 15 32 32 10 4 2 5 * Cycle Score: -2 0 0 0 0 -2 1 -2 0 0 -2 0 -2 -2 1 1 1 1 * EAMSS TALLY: -2 -5 -5 -5 -5 -5 -3 -4 -2 -2 -2 0 0 2 4 3 2 1 * X- Earliest instance of Max-Tally *

* Resulting Transformation: * Bases: C T A C A G A G G G G G G G G C A T * Original Qualities: 30 22 26 27 28 30 7 34 20 19 38 15 32 32 10 4 2 5 * Final Qualities: 30 22 26 27 28 2 2 2 2 2 2 2 2 2 2 2 2 2 * X- Earliest instance of Max-Tally * X - Start of EAMSS masking due to G-Run *

* To further clarify the exception rule here are a few examples: * A C G A C G G G G G G G G G G G G G G G G G G G G A C T * X - Earliest instance of Max-Tally * X - Start of EAMSS masking (with a two base call jump because we have 20 bases in the run already) *

* T T G G A G G G G G G G G G G G G G G G G G G A G A C T * X - Earliest instance of Max-Tally * X - We can skip this A as well as the earlier A because we have 20 or more bases in the run already * X - Start of EAMSS masking (with a two base call jump because we have 20 bases in the run) *

* T T G G G A A G G G G G G G G G G G G G G G G G G T T A T * X - Earliest instance of Max-Tally * X X - WE can skip these bases because the first A counts as the first skip and as far as the length of the string of G's is * concerned, these are both counted like G's * X - This A is the 20th base in the string of G's and therefore can be skipped * X - Note that the A's previous to the G's are only included because there are G's further on that are within the number * of allowable exceptions away (i.e. 2 in this instance), if there were NO G's after the A's you CANNOT count the A's * as part of the G strings (even if no exceptions have previously occured) In other words, the end of the string of G's * MUST end in a G not an "exception" *

* However, if the max-tally occurs to the right of the run of Gs then this is still part of the string of G's but does count towards * the number of exceptions allowable * (e.g.) * T T G G G G G G G G G G A C G * X - Earliest instance of Max-tally * The first index CAN be considered as an exception, the above would be masked to * the following point: * T T G G G G G G G G G G A C G * X - End of EAMSS masking due to G-Run *

* To sum up the final points, a string of G's CAN START with an exception but CANNOT END in an exception. * * @param bases Bases for a single read in the cluster ( not the entire cluster ) * @param qualities Qualities for a single read in the cluster ( not the entire cluster ) */ protected static void runEamssForReadInPlace(final byte[] bases, final byte[] qualities) { int eamssTally = 0; int maxTally = Integer.MIN_VALUE; int indexOfMax = -1; for (int i = bases.length - 1; i >= 0; i--) { final int quality = (0xff & qualities[i]); if (quality >= EAMSS_M2_GE_THRESHOLD) { eamssTally -= 2; } else if (quality < EAMSS_S1_LT_THRESHOLD) { eamssTally += 1; } if (eamssTally >= maxTally) { indexOfMax = i; maxTally = eamssTally; } } if (maxTally >= 1) { int numGs = 0; int exceptions = 0; for (int i = indexOfMax; i >= 0; i--) { if (bases[i] == 'G') { ++numGs; } else { final Integer skip = skipBy(i, numGs, exceptions, bases); if (skip != null) { exceptions += skip; numGs += skip; i -= (skip - 1); } else { break; } } } if (numGs >= 10) { indexOfMax = (indexOfMax + 1) - numGs; } for (int i = indexOfMax; i < qualities.length; i++) { qualities[i] = MASKING_QUALITY; } } } /** * Determine whether or not the base at index is part of a skippable section in a run of G's, if so * return the number of bases that the section is composed of. * * @param index Current index, which should be the index of a non-'G' base * @param numGs The number of bases in the current string of G's for this read * @param prevExceptions The number of exceptions previously detected in this string by this method * @param bases The bases of this read * @return If we have not reached our exception limit (1/every 10bases) and a G is within exceptionLimit(numGs/10) * indices before the current index then return index - (index of next g), else return null Null indicates this is * NOT a skippable region, if we run into index 0 without finding a g then NULL is also returned */ private static Integer skipBy(final int index, final int numGs, final int prevExceptions, final byte[] bases) { Integer skip = null; for (int backup = 1; backup <= index; backup++) { final int exceptionLimit = Math.max((numGs + backup) / 10, 1); if (prevExceptions + backup > exceptionLimit) { break; } if (bases[index - backup] == 'G') { skip = backup; break; } } return skip; } private class BclDataCycleFileParser implements CycleFilesParser { final CloseableIterator reader; public BclDataCycleFileParser(final List files) { reader = new BclReader(files, outputMapping.getOutputReadLengths(), bclQualityEvaluationStrategy, false); } @Override public void close() { reader.close(); } @Override public BclData next() { if (!hasNext()) { throw new NoSuchElementException(); } return reader.next(); } @Override public boolean hasNext() { try { return reader.hasNext(); } catch (final NullPointerException npe) { return false; } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy