All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.illumina.CollectIlluminaBasecallingMetrics Maven / Gradle / Ivy

Go to download

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2011 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.illumina;

import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.StringUtil;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Argument;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.BaseCallingProgramGroup;
import picard.illumina.parser.BaseIlluminaDataProvider;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.IlluminaFileUtil;
import picard.illumina.parser.ParameterizedFileUtil;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.AbstractIlluminaPositionFileReader;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.illumina.parser.readers.LocsFileReader;
import picard.util.TabbedTextFileWithHeaderParser;

import java.io.File;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;


/**
 * A Command line tool to collect Illumina Basecalling metrics for a sequencing run
 * Requires a Lane and an input file of Barcodes to expect.
 * Outputs metrics:
 * 
    *
  • Mean Clusters Per Tile
  • *
  • Standard Deviation of Clusters Per Tile
  • *
  • Mean Pf Clusters Per Tile
  • *
  • Standard Deviation of Pf Clusters Per Tile
  • *
  • Mean Percentage of Pf Clusters Per Tile
  • *
  • Standard Deviation of Percentage of Pf Clusters Per Tile
  • *
*/ @CommandLineProgramProperties( summary = CollectIlluminaBasecallingMetrics.USAGE_SUMMARY + CollectIlluminaBasecallingMetrics.USAGE_DETAILS, oneLineSummary = CollectIlluminaBasecallingMetrics.USAGE_SUMMARY, programGroup = BaseCallingProgramGroup.class ) @DocumentedFeature public class CollectIlluminaBasecallingMetrics extends CommandLineProgram { static final String USAGE_SUMMARY = "Collects Illumina Basecalling metrics for a sequencing run. "; static final String USAGE_DETAILS = "

This tool will produce per-barcode and per-lane basecall metrics for each sequencing run. " + "Mean values for each metric are determined using data from all of the tiles. This tool requires the following data, LANE(#), " + "BASECALLS_DIR, READ_STRUCTURE, and an input file listing the sample barcodes. " + "Program will provide metrics including: the total numbers of bases, reads, and clusters, as well as the fractions of each " + "bases, reads, and clusters that passed Illumina quality filters (PF) both per barcode and per lane. " + "For additional information on Illumina's PF quality metric, please see the corresponding " + "GATK Dictionary entry.

" + "

The input barcode_list.txt file is a file containing all of the sample and molecular barcodes and can be obtained from the " + "ExtractIlluminaBarcodes " + "tool.

" + "" + "Note: Metrics labeled as percentages are actually expressed as fractions! " + "" + "

Usage example:

" + "
" +
            "java -jar picard.jar CollectIlluminaBasecallingMetrics \\
" + " BASECALLS_DIR=/BaseCalls/ \\
" + " LANE=001 \\
" + " READ_STRUCTURE=25T8B25T \\
" + " INPUT=barcode_list.txt " + "
" + "

Please see the CollectIlluminaBasecallingMetrics " + "definitions " + "for a complete description of the metrics produced by this tool.

" + "
"; //Command Line Arguments @Argument(doc = "The Illumina basecalls output directory from which data are read", shortName = "B") public File BASECALLS_DIR; @Argument(doc = "The barcodes directory with _barcode.txt files (generated by ExtractIlluminaBarcodes). If not set, use BASECALLS_DIR. ", shortName = "BCD", optional = true) public File BARCODES_DIR; @Argument(doc = "The lane whose data will be read", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; // TODO: No longer optional after old workflows are through @Argument(doc = "The file containing barcodes to expect from the run - barcodeData.#", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, optional = true) public File INPUT; @Argument(doc = ReadStructure.PARAMETER_DOC, shortName = "RS") public String READ_STRUCTURE; @Argument(doc = "The file to which the collected metrics are written", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional = true) public File OUTPUT; private int barcodeLength = 0; private String unmatchedBarcode; private final SortedMap barcodeToMetricCounts; private static final String BARCODE_NAME_COLUMN = "barcode_name"; private static final String BARCODE_SEQUENCE_COLUMN_NAME_STUB = "barcode_sequence_"; public CollectIlluminaBasecallingMetrics() { this.barcodeToMetricCounts = new TreeMap<>(); } @Override protected int doWork() { // File and Directory Validation IOUtil.assertDirectoryIsReadable(BASECALLS_DIR); if (OUTPUT == null) OUTPUT = new File(BASECALLS_DIR, String.format("LANE%s_basecalling_metrics", LANE)); IOUtil.assertFileIsWritable(OUTPUT); final IlluminaDataProviderFactory factory; final ReadStructure readStructure = new ReadStructure(READ_STRUCTURE); final BclQualityEvaluationStrategy bclQualityEvaluationStrategy = new BclQualityEvaluationStrategy(BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY); if (INPUT == null) { // TODO: Legacy support. Remove when INPUT is required, after all old workflows are through factory = new IlluminaDataProviderFactory(BASECALLS_DIR, LANE, readStructure, bclQualityEvaluationStrategy, IlluminaDataType.PF, IlluminaDataType.Position); } else { // Grab expected barcode data from barcodeData. IOUtil.assertFileIsReadable(INPUT); final TabbedTextFileWithHeaderParser barcodesParser = new TabbedTextFileWithHeaderParser(INPUT); for (final TabbedTextFileWithHeaderParser.Row row : barcodesParser) { final String barcodeName = row.getField(BARCODE_NAME_COLUMN); final StringBuilder barcode = new StringBuilder(); for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) { barcode.append(row.getField(BARCODE_SEQUENCE_COLUMN_NAME_STUB + i)); if (barcodeLength == 0) barcodeLength = barcode.length(); } // Only add the barcode to the hash if it has sequences. For libraries // that don't have barcodes this won't be set in the file. if (barcode.length() > 0) { barcodeToMetricCounts.put(barcode.toString(), new IlluminaMetricCounts(barcode.toString(), barcodeName, LANE)); } } if (IlluminaFileUtil.hasCbcls(BASECALLS_DIR, LANE)) { factory = new IlluminaDataProviderFactory(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure, bclQualityEvaluationStrategy); } else { factory = barcodeToMetricCounts.isEmpty() ? new IlluminaDataProviderFactory( BASECALLS_DIR, BARCODES_DIR, LANE, readStructure, bclQualityEvaluationStrategy, IlluminaDataType.PF, IlluminaDataType.Position) : new IlluminaDataProviderFactory( BASECALLS_DIR, BARCODES_DIR, LANE, readStructure, bclQualityEvaluationStrategy, IlluminaDataType.PF, IlluminaDataType.Position, IlluminaDataType.Barcodes); } } unmatchedBarcode = StringUtil.repeatCharNTimes('N', barcodeLength); //Initialize data provider, iterate over clusters, and collect statistics if (IlluminaFileUtil.hasCbcls(BASECALLS_DIR, LANE)) { setupNewDataProvider(factory); } else { final BaseIlluminaDataProvider provider = factory.makeDataProvider(); while (provider.hasNext()) { final ClusterData cluster = provider.next(); addCluster(cluster); } } onComplete(); return 0; } private void setupNewDataProvider(final IlluminaDataProviderFactory factory) { if (BARCODES_DIR == null) BARCODES_DIR = BASECALLS_DIR; final File laneDir = new File(BASECALLS_DIR, IlluminaFileUtil.longLaneStr(LANE)); final File[] cycleDirs = IOUtil.getFilesMatchingRegexp(laneDir, IlluminaFileUtil.CYCLE_SUBDIRECTORY_PATTERN); //CBCLs final List cbcls = Arrays.stream(cycleDirs) .flatMap(cycleDir -> Arrays.stream(IOUtil.getFilesMatchingRegexp(cycleDir, "^" + IlluminaFileUtil.longLaneStr(LANE) + "_(\\d{1,5}).cbcl$"))).collect(Collectors.toList()); if (cbcls.size() == 0) { throw new PicardException("No CBCL files found."); } IOUtil.assertFilesAreReadable(cbcls); //locs final List locs = new ArrayList<>(); final File locsFile = new File(BASECALLS_DIR.getParentFile(), AbstractIlluminaPositionFileReader.S_LOCS_FILE); IOUtil.assertFileIsReadable(locsFile); try (LocsFileReader locsFileReader = new LocsFileReader(locsFile)) { while (locsFileReader.hasNext()) { locs.add(locsFileReader.next()); } } //filter final Pattern laneTileRegex = Pattern.compile(ParameterizedFileUtil.escapePeriods( ParameterizedFileUtil.makeLaneTileRegex(".filter", LANE))); final File[] filterFiles = NewIlluminaBasecallsConverter.getTiledFiles(laneDir, laneTileRegex); IOUtil.assertFilesAreReadable(Arrays.asList(filterFiles)); final Pattern barcodeRegex = Pattern.compile(ParameterizedFileUtil.escapePeriods( ParameterizedFileUtil.makeBarcodeRegex(LANE))); final Map barcodesFiles = new HashMap<>(); for (final File barcodeFile : NewIlluminaBasecallsConverter.getTiledFiles(BARCODES_DIR, barcodeRegex)) { final Matcher tileMatcher = barcodeRegex.matcher(barcodeFile.getName()); if (tileMatcher.matches()) { IOUtil.assertFileIsReadable(barcodeFile); barcodesFiles.put(Integer.valueOf(tileMatcher.group(1)), barcodeFile); } } factory.getAvailableTiles().forEach(tile -> { final File barcodeFile = barcodesFiles.get(tile); final BaseIlluminaDataProvider provider = factory.makeDataProvider(cbcls, locs, filterFiles, tile, barcodeFile); while (provider.hasNext()) { addCluster(provider.next()); } }); } /*** * Process new cluster of Illumina data - increment a running counter of data */ private void addCluster(final ClusterData cluster) { //compute hash of Barcode and Lane for key String barcode = cluster.getMatchedBarcode(); if (barcode == null) barcode = unmatchedBarcode; //increment counts IlluminaMetricCounts counters = barcodeToMetricCounts.get(barcode); if (counters == null) { counters = new IlluminaMetricCounts(barcode, null, LANE); barcodeToMetricCounts.put(barcode, counters); } final int tileNumber = cluster.getTile(); counters.incrementClusterCount(tileNumber, cluster.isPf()); } /** * Handles completion of metric collection. Metrics are computed from counts of data and written out to a file. */ private void onComplete() { try { final MetricsFile> file = getMetricsFile(); final IlluminaMetricCounts allLaneCounts = new IlluminaMetricCounts(null, null, LANE); for (final String s : barcodeToMetricCounts.keySet()) { final IlluminaMetricCounts counts = barcodeToMetricCounts.get(s); counts.addMetricsToFile(file); allLaneCounts.addIlluminaMetricCounts(counts); } if (!barcodeToMetricCounts.keySet().contains("")) { allLaneCounts.addMetricsToFile(file); // detect non-indexed case } file.write(OUTPUT); } catch (final Exception ex) { throw new PicardException("Error writing output file " + OUTPUT.getPath(), ex); } } public static void main(final String[] argv) { new CollectIlluminaBasecallingMetrics().instanceMainWithExit(argv); } /*** * This class manages counts of Illumina Basecalling data on a Per Barcode Per Lane basis. Cluster and PFCluster * counts are stored per tile number. */ private class IlluminaMetricCounts { /*** Stores counts of clusters found for a specific Barcode-Lane combination across all tiles. Key = Tile Number, Value = count of clusters***/ private final Histogram tileToClusterHistogram; /*** Stores counts of pf clusters found for a specific Barcode-Lane combination across all tiles. Key = Tile Number, Value = count of clusters***/ private final Histogram tileToPfClusterHistogram; final IlluminaBasecallingMetrics metrics; public IlluminaMetricCounts(final String barcode, final String barcodeName, final Integer laneNumber) { this.tileToClusterHistogram = new Histogram<>(); this.tileToPfClusterHistogram = new Histogram<>(); this.metrics = new IlluminaBasecallingMetrics(); this.metrics.MOLECULAR_BARCODE_SEQUENCE_1 = barcode; this.metrics.MOLECULAR_BARCODE_NAME = barcodeName; this.metrics.LANE = Integer.toString(laneNumber); } /* Increments cluster count by 1 for a given tile number */ public void incrementClusterCount(final int tileNumber, final boolean isPf) { incrementClusterCount(tileNumber, 1d, isPf); } /* Increments cluster count by an amount for a given tile number */ public void incrementClusterCount(final int tileNumber, final double incrementAmount, final boolean isPf) { incrementClusterCount(tileNumber, incrementAmount, (isPf ? 1d : 0d)); } /* Increments cluster count by an amount for a given tile number */ public void incrementClusterCount(final Integer tileNumber, final double incrementAmount, final double pfIncrementAmount) { tileToClusterHistogram.increment(tileNumber, incrementAmount); tileToPfClusterHistogram.increment(tileNumber, pfIncrementAmount); } /* Handles calculating final metrics and updating the metric object */ private void onComplete() { final double meanClustersPerTile = tileToClusterHistogram.getMeanBinSize(); metrics.MEAN_CLUSTERS_PER_TILE = Math.round(meanClustersPerTile); metrics.SD_CLUSTERS_PER_TILE = Math.round(tileToClusterHistogram.getStandardDeviationBinSize(meanClustersPerTile)); final double meanPfClustersPerTile = tileToPfClusterHistogram.getMeanBinSize(); metrics.MEAN_PF_CLUSTERS_PER_TILE = Math.round(meanPfClustersPerTile); metrics.SD_PF_CLUSTERS_PER_TILE = Math.round(tileToPfClusterHistogram.getStandardDeviationBinSize(meanPfClustersPerTile)); final DecimalFormat decFormat = new DecimalFormat("#.##"); final Histogram laneToPctPfClusterHistogram = tileToPfClusterHistogram.divideByHistogram(tileToClusterHistogram); final double meanPctPfClustersPerTile = laneToPctPfClusterHistogram.getMeanBinSize(); metrics.MEAN_PCT_PF_CLUSTERS_PER_TILE = (Double.isNaN(meanPctPfClustersPerTile) ? 0 : Double.valueOf(decFormat.format(meanPctPfClustersPerTile * 100))); metrics.SD_PCT_PF_CLUSTERS_PER_TILE = Double.valueOf(decFormat.format(laneToPctPfClusterHistogram.getStandardDeviationBinSize(meanPctPfClustersPerTile) * 100)); metrics.TOTAL_CLUSTERS = (long) this.tileToClusterHistogram.getSumOfValues(); metrics.PF_CLUSTERS = (long) this.tileToPfClusterHistogram.getSumOfValues(); final ReadStructure readStructure = new ReadStructure(READ_STRUCTURE); int templateBaseCountPerCluster = 0; for (int i = 0; i < readStructure.templates.length(); i++) { templateBaseCountPerCluster += readStructure.templates.get(i).length; } metrics.TOTAL_READS = metrics.TOTAL_CLUSTERS * readStructure.templates.length(); metrics.PF_READS = metrics.PF_CLUSTERS * readStructure.templates.length(); metrics.TOTAL_BASES = metrics.TOTAL_CLUSTERS * templateBaseCountPerCluster; metrics.PF_BASES = metrics.PF_CLUSTERS * templateBaseCountPerCluster; } /* Computes final metric based on data counts and writes to output metric file */ public void addMetricsToFile(final MetricsFile> file) { onComplete(); file.addMetric(metrics); } /* Merges data from another IlluminaMetricCount object into current one.*/ public void addIlluminaMetricCounts(final IlluminaMetricCounts counts) { this.tileToClusterHistogram.addHistogram(counts.tileToClusterHistogram); this.tileToPfClusterHistogram.addHistogram(counts.tileToPfClusterHistogram); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy