// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina.quality;
import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Metrics;
import picard.illumina.parser.BaseIlluminaDataProvider;
import picard.illumina.parser.ClusterData;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.ReadData;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
* Collect metrics regarding the reason for reads (sequenced by HiSeqX) not passing the Illumina PF Filter. (BETA)
*
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
usage = CollectHiSeqXPfFailMetrics.USAGE_SUMMARY + CollectHiSeqXPfFailMetrics.USAGE_DETAILS,
usageShort = CollectHiSeqXPfFailMetrics.USAGE_SUMMARY,
programGroup = Metrics.class
)
public class CollectHiSeqXPfFailMetrics extends CommandLineProgram {
/**
 * One-sentence description of the tool. Used on its own as the short usage text and as the
 * leading part of the full usage text in the {@code @CommandLineProgramProperties} annotation.
 */
static final String USAGE_SUMMARY =
        "Classify PF-Failing reads in a HiSeqX Illumina Basecalling directory into various categories.";
static final String USAGE_DETAILS = "
This tool categorizes the reads that did not pass filter " +
"(PF-Failing) into four groups. These groups are based on a heuristic that was derived by looking at a" +
" few titration experiments.
" +
"" +
"
After examining the called bases from the first 24 cycles of each read, the PF-Failed reads " +
"are grouped into the following four categories: " +
"
" +
"
MISALIGNED - The first 24 basecalls of a read are uncalled (numNs~24). " +
" These types of reads appear to be flow cell artifacts because reads were only found near tile boundaries " +
"and were concentration (library) independent
" +
"
EMPTY - All 24 bases are called (numNs~0) but the number of bases with quality scores" +
" greater than two is less than or equal to eight (numQGtTwo<=8). These reads were location independent" +
" within the tiles and were inversely proportional to the library concentration
" +
"
POLYCLONAL - All 24 bases were called and numQGtTwo>=12, were independent of their location" +
" with the tiles, and were directly proportional to the library concentration. These reads are likely" +
" the result of PCR artifacts
" +
"
UNKNOWN - The remaining reads that are PF-Failing but did not fit into any of the groups " +
"listed above
" +
"
"+
"" +
"
The tool defaults to the SUMMARY output which indicates the number of PF-Failed reads per tile and" +
" groups them into the categories described above accordingly.
" +
"
A DETAILED metrics option is also available that subdivides the SUMMARY outputs by the x- y- position" +
" of these reads within each tile. To obtain the DETAILED metric table, you must add the " +
"PROB_EXPLICIT_READS option to your command line and set the value between 0 and 1. This value represents" +
" the fractional probability of PF-Failed reads to send to output. For example, if PROB_EXPLICIT_READS=0, " +
"then no metrics will be output. If PROB_EXPLICIT_READS=1, then it will " +
"provide detailed metrics for all (100%) of the reads. It follows that setting the " +
"PROB_EXPLICIT_READS=0.5, will provide detailed metrics for half of the PF-Failed reads.
"+
"
Note: Metrics labeled as percentages are actually expressed as fractions!
" +
"" +
"Please see our documentation on the " +
"SUMMARY" +
" and " +
"" +
"DETAILED " +
"metrics for comprehensive explanations of the outputs produced by this tool." +
"";
/** The Illumina basecalls directory, supplied on the command line (short name {@code B}). */
@Option(shortName = "B", doc = "The Illumina basecalls directory. ")
public File BASECALLS_DIR;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Basename for metrics file. Resulting file will be" +
"