All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.illumina.quality.CollectHiSeqXPfFailMetrics Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2014 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.illumina.quality;

import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Metrics;
import picard.illumina.parser.BaseIlluminaDataProvider;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.ReadData;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Collect metrics regarding the reason for reads (sequenced by HiSeqX) not passing the Illumina PF Filter. (BETA)
 *
 * @author Yossi Farjoun
 */
@CommandLineProgramProperties(
        usage = CollectHiSeqXPfFailMetrics.USAGE_SUMMARY + CollectHiSeqXPfFailMetrics.USAGE_DETAILS,
        usageShort = CollectHiSeqXPfFailMetrics.USAGE_SUMMARY,
        programGroup = Metrics.class
)

public class CollectHiSeqXPfFailMetrics extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Classify PF-Failing reads in a HiSeqX Illumina Basecalling directory into " +
            "various categories.";
    static final String USAGE_DETAILS = "

This tool categorizes the reads that did not pass filter " + "(PF-Failing) into four groups. These groups are based on a heuristic that was derived by looking at a" + " few titration experiments.

" + "" + "

After examining the called bases from the first 24 cycles of each read, the PF-Failed reads " + "are grouped into the following four categories: " + "

    " + "
  • MISALIGNED - The first 24 basecalls of a read are uncalled (numNs~24). " + " These types of reads appear to be flow cell artifacts because reads were only found near tile boundaries " + "and were concentration (library) independent
  • " + "
  • EMPTY - All 24 bases are called (numNs~0) but the number of bases with quality scores" + " greater than two is less than or equal to eight (numQGtTwo<=8). These reads were location independent" + " within the tiles and were inversely proportional to the library concentration
  • " + "
  • POLYCLONAL - All 24 bases were called and numQGtTwo>=12, were independent of their location" + " with the tiles, and were directly proportional to the library concentration. These reads are likely" + " the result of PCR artifacts
  • " + "
  • UNKNOWN - The remaining reads that are PF-Failing but did not fit into any of the groups " + "listed above
  • " + "

"+ "" + "

The tool defaults to the SUMMARY output which indicates the number of PF-Failed reads per tile and" + " groups them into the categories described above accordingly.

" + "

A DETAILED metrics option is also available that subdivides the SUMMARY outputs by the x- y- position" + " of these reads within each tile. To obtain the DETAILED metric table, you must add the " + "PROB_EXPLICIT_READS option to your command line and set the value between 0 and 1. This value represents" + " the fractional probability of PF-Failed reads to send to output. For example, if PROB_EXPLICIT_READS=0, " + "then no metrics will be output. If PROB_EXPLICIT_READS=1, then it will " + "provide detailed metrics for all (100%) of the reads. It follows that setting the " + "PROB_EXPLICIT_READS=0.5, will provide detailed metrics for half of the PF-Failed reads.

"+ "

Note: Metrics labeled as percentages are actually expressed as fractions!

" + "" + "

Usage example: (SUMMARY Metrics)

" + "
" +
            "java -jar picard.jar CollectHiSeqXPfFailMetrics \\
" + " BASECALLS_DIR=/BaseCalls/ \\
" + " OUTPUT=/metrics/ \\
" + " LANE=001" + "
" + "

Usage example: (DETAILED Metrics)

" + "
"+
            "java -jar picard.jar CollectHiSeqXPfFailMetrics \\
" + " BASECALLS_DIR=/BaseCalls/ \\
" + " OUTPUT=/Detail_metrics/ \\
" + " LANE=001 \\
" + " PROB_EXPLICIT_READS=1" + "
" + "" + "Please see our documentation on the " + "SUMMARY" + " and " + "" + "DETAILED " + "metrics for comprehensive explanations of the outputs produced by this tool." + "
"; @Option(doc = "The Illumina basecalls directory. ", shortName = "B") public File BASECALLS_DIR; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Basename for metrics file. Resulting file will be" + " " + summaryMetricsExtension, optional = false) public File OUTPUT; @Option(shortName = "P", doc = "The fraction of (non-PF) reads for which to output explicit classification. Output file will be " + detailedMetricsExtension + " (if PROB_EXPLICIT_READS != 0)", optional = true) public double PROB_EXPLICIT_READS = 0; @Option(doc = "Lane number.", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; @Option(shortName = "NP", doc = "Run this many PerTileBarcodeExtractors in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0 then the number of cores used will be " + "the number available on the machine less NUM_PROCESSORS.", optional = true) public int NUM_PROCESSORS = 1; @Option(doc = "Number of cycles to look at. At time of writing PF status gets determined at cycle 24 so numbers greater than this will yield strange results. " + "In addition, PF status is currently determined at cycle 24, so running this with any other value is neither tested nor recommended.", optional = true) public int N_CYCLES = 24; private static final Log LOG = Log.getInstance(CollectHiSeqXPfFailMetrics.class); private final Map tileToSummaryMetrics = new LinkedHashMap(); private final Map> tileToDetailedMetrics = new LinkedHashMap>(); //Add "T" to the number of cycles to create a "TemplateRead" of the desired length. private final ReadStructure READ_STRUCTURE = new ReadStructure(N_CYCLES + "T"); public final static String detailedMetricsExtension = ".pffail_detailed_metrics"; public final static String summaryMetricsExtension = ".pffail_summary_metrics"; @Override protected String[] customCommandLineValidation() { final List errors = new ArrayList(); if (N_CYCLES < 0) { errors.add("Number of Cycles to look at must be greater than 0"); } if (PROB_EXPLICIT_READS > 1 || PROB_EXPLICIT_READS < 0) { errors.add("PROB_EXPLICIT_READS must be a probability, i.e., 0 <= PROB_EXPLICIT_READS <= 1"); } if (!errors.isEmpty()) { return errors.toArray(new String[errors.size()]); } else { return super.customCommandLineValidation(); } } /** Stock main method. */ public static void main(final String[] args) { new CollectHiSeqXPfFailMetrics().instanceMainWithExit(args); } @Override protected int doWork() { final IlluminaDataProviderFactory factory = new IlluminaDataProviderFactory(BASECALLS_DIR, LANE, READ_STRUCTURE, new BclQualityEvaluationStrategy(BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY), IlluminaDataType.BaseCalls, IlluminaDataType.PF, IlluminaDataType.QualityScores, IlluminaDataType.Position); final File summaryMetricsFileName = new File(OUTPUT + summaryMetricsExtension); final File detailedMetricsFileName = new File(OUTPUT + detailedMetricsExtension); IOUtil.assertFileIsWritable(summaryMetricsFileName); if (PROB_EXPLICIT_READS != 0) { IOUtil.assertFileIsWritable(detailedMetricsFileName); } final int numProcessors; if (NUM_PROCESSORS == 0) { numProcessors = Runtime.getRuntime().availableProcessors(); } else if (NUM_PROCESSORS < 0) { numProcessors = Runtime.getRuntime().availableProcessors() + NUM_PROCESSORS; } else { numProcessors = NUM_PROCESSORS; } // Create thread-pool submit jobs and what for their completion LOG.info("Processing with " + numProcessors + " PerTilePFMetricsExtractor(s)."); final ExecutorService pool = Executors.newFixedThreadPool(numProcessors); final List extractors = new ArrayList(factory.getAvailableTiles().size()); for (final int tile : factory.getAvailableTiles()) { tileToSummaryMetrics.put(tile, new PFFailSummaryMetric(Integer.toString(tile))); tileToDetailedMetrics.put(tile, new ArrayList()); final PerTilePFMetricsExtractor extractor = new PerTilePFMetricsExtractor( tile, tileToSummaryMetrics.get(tile), tileToDetailedMetrics.get(tile), factory, PROB_EXPLICIT_READS ); extractors.add(extractor); } try { for (final PerTilePFMetricsExtractor extractor : extractors) { pool.submit(extractor); } pool.shutdown(); // Wait forever for tasks to terminate pool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS); } catch (final Throwable e) { // Cancel if current thread also interrupted LOG.error(e, "Parent thread encountered problem submitting extractors to thread pool or awaiting shutdown of threadpool. Attempting to kill threadpool."); pool.shutdownNow(); return 2; } LOG.info("Processed " + extractors.size() + " tiles."); // Check for exceptions from extractors for (final PerTilePFMetricsExtractor extractor : extractors) { if (extractor.getException() != null) { LOG.error("Abandoning metrics calculation because one or more PerTilePFMetricsExtractors failed."); return 4; } } // Add detailed metrics to file final MetricsFile detailedMetrics = getMetricsFile(); for (final Collection detailedMetricCollection : tileToDetailedMetrics.values()) { for (final PFFailDetailedMetric metric : detailedMetricCollection) { detailedMetrics.addMetric(metric); } } // If detailed metrics were requested, write them now. if (PROB_EXPLICIT_READS > 0) { detailedMetrics.write(detailedMetricsFileName); } // Finish metrics tallying. Looping twice so that the "All" metrics will come out on top. final PFFailSummaryMetric totalMetric = new PFFailSummaryMetric("All"); // a "fake" tile that will contain the total tally for (final PFFailSummaryMetric summaryMetric : tileToSummaryMetrics.values()) { totalMetric.merge(summaryMetric); } // Derive fields for total metric and add to file totalMetric.calculateDerivedFields(); final MetricsFile summaryMetricsFile = getMetricsFile(); summaryMetricsFile.addMetric(totalMetric); // Prepare each tile's derived fields and add it to the file for (final PFFailSummaryMetric summaryMetric : tileToSummaryMetrics.values()) { summaryMetric.calculateDerivedFields(); summaryMetricsFile.addMetric(summaryMetric); } // Actually write the summary metrics to their file. summaryMetricsFile.write(summaryMetricsFileName); return 0; } /** Extracts metrics from a HiSeqX tile. */ private static class PerTilePFMetricsExtractor implements Runnable { private final int tile; private final PFFailSummaryMetric summaryMetric; final Collection detailedMetrics; private Exception exception = null; private final BaseIlluminaDataProvider provider; final private double pWriteDetailed; final private Random random = new Random(); /** * Constructor * * @param tile The number of the tile being processed. * @param summaryMetric A summaryMetric for collecting the tile data in. * @param detailedMetrics A set of metrics for collecting the classification data in. * @param factory A dataprovider for IlluminaData */ public PerTilePFMetricsExtractor( final int tile, final PFFailSummaryMetric summaryMetric, final Collection detailedMetrics, final IlluminaDataProviderFactory factory, final double pWriteDetailed ) { this.tile = tile; this.summaryMetric = summaryMetric; this.detailedMetrics = detailedMetrics; this.pWriteDetailed = pWriteDetailed; this.provider = factory.makeDataProvider(Arrays.asList(tile)); } public Exception getException() { return this.exception; } /** run method which extracts accumulates metrics for a tile */ public void run() { try { LOG.info("Extracting PF metrics for tile " + tile); /** * Sometimes makeDataProvider takes a while waiting for slow file IO, for each tile the needed set of files * is non-overlapping sets of files so make the data providers in the individual threads for Extractors * so they are not all waiting for each others file operations */ while (provider.hasNext()) { // Extract the PF status and infer reason if FAIL from the cluster and update the summaryMetric for the tile final ClusterData cluster = provider.next(); this.summaryMetric.READS++; if (!cluster.isPf()) { this.summaryMetric.PF_FAIL_READS++; final ReadClassifier readClassifier = new ReadClassifier(cluster.getRead(0)); if (random.nextDouble() < pWriteDetailed) { detailedMetrics.add(new PFFailDetailedMetric(tile, cluster.getX(), cluster.getY(), readClassifier.numNs, readClassifier.numQGtTwo, readClassifier.failClass)); } switch (readClassifier.failClass) { case EMPTY: this.summaryMetric.PF_FAIL_EMPTY++; break; case MISALIGNED: this.summaryMetric.PF_FAIL_MISALIGNED++; break; case POLYCLONAL: this.summaryMetric.PF_FAIL_POLYCLONAL++; break; case UNKNOWN: this.summaryMetric.PF_FAIL_UNKNOWN++; break; default: LOG.error("Got unexpected fail Reason"); } } } } catch (final Exception e) { LOG.error(e, "Error processing tile ", this.tile); this.exception = e; } finally { provider.close(); } } } protected static class ReadClassifier { public enum PfFailReason { EMPTY, POLYCLONAL, MISALIGNED, UNKNOWN } private final int numNs; // The number of Ns in the base calls private final int numQGtTwo; // The number of quality scores greater than 2 private PfFailReason failClass = null; // The classification of the failure mode /** * Heart of CLP. * This class actually classifies ReadData into the reason why it failed PF * classification is based on a small set of titrated flowcells sequenced at the Broad Institute by the Genomics Platform. * Three cluster were observed: * - numNs~24 and was found only near the boundaries of tiles. it didn't seem to depend on concentration. For this reason it * was classified as MISALIGNED *

* - numNs~0 and numQGtTwo<=8 these were found throughout the tiles and _decreased_ in number as the concentration of the library increased * Thus it was concluded that these correspond to the EMPTY wells *

* - numNs~0 and numQGtTwo>=12 there were found throughout the tiles and _increased_ in number as the concentration of the library increased * Thus it was concluded that these correspond to the POLYCLONAL wells *

* - the remaining reads were few in number the classification for them wasn't clear. Thus they are left as UNKNOWN. *

* We use the length of the read as a parameter and scale the 8 and the 12 accordingly as length/3 and length/2, but in reality this has only * been tested on length=24. * * @param read The read to classify. */ public ReadClassifier(final ReadData read) { final int length = read.getBases().length; numNs = countEquals(read.getBases(), (byte) '.'); // Ns are returned as periods from Illumina numQGtTwo = countGreaterThan(read.getQualities(), (byte) 2); failClass = PfFailReason.UNKNOWN; //for cases not covered below if (numNs >= (length - 1)) { failClass = PfFailReason.MISALIGNED; } else if (numNs <= 1) { if (numQGtTwo <= length / 3) { failClass = PfFailReason.EMPTY; } else if (numQGtTwo >= length / 2) { failClass = PfFailReason.POLYCLONAL; } } } } /** a metric class for describing FP failing reads from an Illumina HiSeqX lane * */ public static class PFFailDetailedMetric extends MetricBase { /** The Tile that is described by this metric */ public Integer TILE; /** The X coordinate of the read within the tile */ public int X; /** The Y coordinate of the read within the tile */ public int Y; /** The number of Ns found in this read */ public int NUM_N; /** The number of Quality scores greater than 2 found in this read */ public int NUM_Q_GT_TWO; /** * The classification of this read: {EMPTY, POLYCLONAL, MISALIGNED, UNKNOWN} * (See PFFailSummaryMetric for explanation regarding the possible classification.) */ public ReadClassifier.PfFailReason CLASSIFICATION; public PFFailDetailedMetric(final Integer TILE, final int x, final int y, final int NUM_N, final int NUM_Q_GT_TWO, final ReadClassifier.PfFailReason CLASSIFICATION) { this.TILE = TILE; X = x; Y = y; this.NUM_N = NUM_N; this.NUM_Q_GT_TWO = NUM_Q_GT_TWO; this.CLASSIFICATION = CLASSIFICATION; } /** This ctor is necessary for when reading metrics from file */ public PFFailDetailedMetric() { } } /** * Metrics produced by the GetHiSeqXPFFailMetrics program. Used to diagnose lanes from HiSeqX * Sequencing, providing the number and fraction of each of the reasons that reads could have not passed PF. * Possible reasons are EMPTY (reads from empty wells with no template strand), POLYCLONAL (reads from wells that had more than one strand * cloned in them), MISALIGNED (reads from wells that are near the edge of the tile), UNKNOWN (reads that didn't pass PF but couldn't be diagnosed) */ public static class PFFailSummaryMetric extends MetricBase { /** The Tile that is described by this metric. Can be a string (like "All") to mean some marginal over tiles. * */ public String TILE = null; /** The total number of reads examined */ public int READS = 0; /** The number of non-PF reads in this tile. */ public int PF_FAIL_READS = 0; /** The fraction of PF_READS */ public double PCT_PF_FAIL_READS = 0.0; /** The number of non-PF reads in this tile that are deemed empty. */ public int PF_FAIL_EMPTY = 0; /** The fraction of non-PF reads in this tile that are deemed empty (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_EMPTY = 0.0; /** The number of non-PF reads in this tile that are deemed multiclonal. */ public int PF_FAIL_POLYCLONAL = 0; /** The fraction of non-PF reads in this tile that are deemed multiclonal (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_POLYCLONAL = 0.0; /** The number of non-PF reads in this tile that are deemed "misaligned". */ public int PF_FAIL_MISALIGNED = 0; /** The fraction of non-PF reads in this tile that are deemed "misaligned" (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_MISALIGNED = 0.0; /** The number of non-PF reads in this tile that have not been classified. */ public int PF_FAIL_UNKNOWN = 0; /** The fraction of non-PF reads in this tile that have not been classified (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_UNKNOWN = 0.0; //Constructor takes a String for tile since we want to have one instance with tile="All". This column will contain the // summary of all the tiles public PFFailSummaryMetric(final String tile) { TILE = tile; } /** This ctor is necessary for when reading metrics from file */ public PFFailSummaryMetric() { } /** * Adds the non-calculated fields from the other metric to this one. * * @param metric */ public void merge(final PFFailSummaryMetric metric) { this.READS += metric.READS; this.PF_FAIL_READS += metric.PF_FAIL_READS; this.PF_FAIL_EMPTY += metric.PF_FAIL_EMPTY; this.PF_FAIL_MISALIGNED += metric.PF_FAIL_MISALIGNED; this.PF_FAIL_POLYCLONAL += metric.PF_FAIL_POLYCLONAL; this.PF_FAIL_UNKNOWN += metric.PF_FAIL_UNKNOWN; } public void calculateDerivedFields() { //protect against divide by zero if (this.READS != 0) { this.PCT_PF_FAIL_READS = (double) this.PF_FAIL_READS / this.READS; this.PCT_PF_FAIL_EMPTY = (double) this.PF_FAIL_EMPTY / this.READS; this.PCT_PF_FAIL_MISALIGNED = (double) this.PF_FAIL_MISALIGNED / this.READS; this.PCT_PF_FAIL_POLYCLONAL = (double) this.PF_FAIL_POLYCLONAL / this.READS; this.PCT_PF_FAIL_UNKNOWN = (double) this.PF_FAIL_UNKNOWN / this.READS; } } } /** * a simple function that counts how many elements in array are equal to 'toCount' * * @param array * @param toCount * @return number of elements in array that == 'toCount' */ static private int countEquals(final byte[] array, final byte toCount) { int count = 0; for (final byte t : array) { if (t == toCount) count++; } return count; } /** * a simple function that counts how many elements in array are greater-than-or-equal-to 'value' * * @param array * @param value * @return number of elements in array that >= 'value' */ static private int countGreaterThan(final byte[] array, final byte value) { int count = 0; for (final int t : array) { if (t > value) count++; } return count; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy