picard.illumina.IlluminaBasecallsToSam Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;
import htsjdk.samtools.BAMRecordCodec;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordQueryNameComparator;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Iso8601Date;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Illumina;
import picard.illumina.parser.IlluminaFileUtil;
import picard.illumina.parser.ReadStructure;
import picard.illumina.parser.readers.BclQualityEvaluationStrategy;
import picard.util.AdapterPair;
import picard.util.IlluminaUtil;
import picard.util.IlluminaUtil.IlluminaAdapterPair;
import picard.util.TabbedTextFileWithHeaderParser;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* IlluminaBasecallsToSam transforms a lane of Illumina data file formats (bcl, locs, clocs, qseqs, etc.) into
* SAM or BAM file format.
*
* In this application, barcode data is read from Illumina data file groups, each of which is associated with a tile.
* Each tile may contain data for any number of barcodes, and a single barcode's data may span multiple tiles. Once the
* barcode data is collected from files, each barcode's data is written to its own SAM/BAM. The barcode data must be
* written in order; this means that barcode data from each tile is sorted before it is written to file, and that if a
* barcode's data does span multiple tiles, data collected from each tile must be written in the order of the tiles
* themselves.
*
* This class employs a number of private subclasses to achieve this goal. The TileReadAggregator controls the flow
* of operation. It is fed a number of Tiles which it uses to spawn TileReaders. TileReaders are responsible for
* reading Illumina data for their respective tiles from disk, and as they collect that data, it is fed back into the
* TileReadAggregator. When a TileReader completes a tile, it notifies the TileReadAggregator, which reviews what was
* read and conditionally queues its writing to disk, bearing in mind the requirements of write-order described in the
* previous paragraph. As writes complete, the TileReadAggregator re-evaluates the state of reads/writes and may queue
* more writes. When all barcodes for all tiles have been written, the TileReadAggregator shuts down.
*
* The TileReadAggregator controls task execution using a specialized ThreadPoolExecutor. It accepts special Runnables
* of type PriorityRunnable which allow a priority to be assigned to the runnable. When the ThreadPoolExecutor is
* assigning threads, it gives priority to those PriorityRunnables with higher priority values. In this application,
* TileReaders are assigned lowest priority, and write tasks are assigned high priority. It is designed in this fashion
* to minimize the amount of time data must remain in memory (write the data as soon as possible, then discard it from
* memory) while maximizing CPU usage.
*
* @author [email protected]
* @author [email protected]
*/
@CommandLineProgramProperties(
usage = IlluminaBasecallsToSam.USAGE_SUMMARY + IlluminaBasecallsToSam.USAGE_DETAILS,
usageShort = IlluminaBasecallsToSam.USAGE_SUMMARY,
programGroup = Illumina.class
)
public class IlluminaBasecallsToSam extends CommandLineProgram {
static final String USAGE_SUMMARY = "Transforms raw Illumina sequencing data into an unmapped SAM or BAM file.";
static final String USAGE_DETAILS = "The IlluminaBaseCallsToSam program collects, demultiplexes, and sorts reads across all " +
"of the tiles of a lane via barcode to produce an unmapped SAM/BAM file. An unmapped BAM file is often referred to as a uBAM. " +
"All barcode, sample, and library data is provided in the LIBRARY_PARAMS file. Note, this LIBRARY_PARAMS file " +
"should be formatted according to the specifications indicated below. The following is an example of a properly" +
" formatted LIBRARY_PARAMS file:
" +
"BARCODE_1\tOUTPUT\tSAMPLE_ALIAS\tLIBRARY_NAME\n" +
"AAAAAAAA\tSA_AAAAAAAA.bam\tSA_AAAAAAAA\tLN_AAAAAAAA\n" +
"AAAAGAAG\tSA_AAAAGAAG.bam\tSA_AAAAGAAG\tLN_AAAAGAAG\n" +
"AACAATGG\tSA_AACAATGG.bam\tSA_AACAATGG\tLN_AACAATGG\n" +
"N\tSA_non_indexed.bam\tSA_non_indexed\tLN_NNNNNNNN\n " +
"" +
"The BARCODES_DIR file is produced by the " +
"ExtractIlluminaBarcodes " +
"tool for each lane of a flow cell.
" +
"Usage example:
" +
"" +
"" +
"java -jar picard.jar IlluminaBasecallsToSam \\
" +
" BASECALLS_DIR=/BaseCalls/ \\
" +
" LANE=001 \\
" +
" READ_STRUCTURE=25T8B25T \\
" +
" RUN_BARCODE=run15 \\
" +
" IGNORE_UNEXPECTED_BARCODES=true \\
" +
" LIBRARY_PARAMS=library.params " +
"
" +
"
";
// The following attributes define the command-line arguments
public static final String USAGE = "Generate a SAM or BAM file from data in an Illumina basecalls output directory";

@Option(doc = "The basecalls directory. ", shortName = "B")
public File BASECALLS_DIR;

@Option(doc = "The barcodes directory with _barcode.txt files (generated by ExtractIlluminaBarcodes). If not set, use BASECALLS_DIR. ", shortName = "BCD", optional = true)
public File BARCODES_DIR;

@Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME)
public Integer LANE;

@Option(doc = "Deprecated (use LIBRARY_PARAMS). The output SAM or BAM file. Format is determined by extension.",
        shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
        mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public File OUTPUT;

@Option(doc = "The barcode of the run. Prefixed to read names.")
public String RUN_BARCODE;

@Option(doc = "Deprecated (use LIBRARY_PARAMS). The name of the sequenced sample",
        shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME,
        mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public String SAMPLE_ALIAS;

// The default described here is applied in customCommandLineValidation().
@Option(doc = "ID used to link RG header record with RG tag in SAM record. " +
        "If these are unique in SAM files that get merged, merge performance is better. " +
        "If not specified, READ_GROUP_ID will be set to <first 5 chars of RUN_BARCODE>.<LANE>.",
        shortName = StandardOptionDefinitions.READ_GROUP_ID_SHORT_NAME, optional = true)
public String READ_GROUP_ID;

@Option(doc = "Deprecated (use LIBRARY_PARAMS). The name of the sequenced library",
        shortName = StandardOptionDefinitions.LIBRARY_NAME_SHORT_NAME,
        optional = true,
        mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"})
public String LIBRARY_NAME;

@Option(doc = "The name of the sequencing center that produced the reads. Used to set the RG.CN tag.", optional = true)
public String SEQUENCING_CENTER = "BI";

@Option(doc = "The start date of the run.", optional = true)
public Date RUN_START_DATE;

@Option(doc = "The name of the sequencing technology that produced the read.", optional = true)
public String PLATFORM = "illumina";

@Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS")
public String READ_STRUCTURE;

@Option(doc = "Deprecated (use LIBRARY_PARAMS). Tab-separated file for creating all output BAMs for barcoded run " +
        "with single IlluminaBasecallsToSam invocation. Columns are BARCODE, OUTPUT, SAMPLE_ALIAS, and " +
        "LIBRARY_NAME. Row with BARCODE=N is used to specify a file for no barcode match",
        mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "LIBRARY_PARAMS"})
public File BARCODE_PARAMS;

@Option(doc = "Tab-separated file for creating all output BAMs for a lane with single IlluminaBasecallsToSam " +
        "invocation. The columns are OUTPUT, SAMPLE_ALIAS, and LIBRARY_NAME, BARCODE_1, BARCODE_2 ... BARCODE_X " +
        "where X = number of barcodes per cluster (optional). Row with BARCODE_1 set to 'N' is used to specify a file " +
        "for no barcode match. You may also provide any 2 letter RG header attributes (excluding PU, CN, PL, and" +
        " DT) as columns in this file and the values for those columns will be inserted into the RG tag for the" +
        " BAM file created for a given row.",
        mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "BARCODE_PARAMS"})
public File LIBRARY_PARAMS;

// Element type is declared explicitly instead of using a raw List.
@Option(doc = "Which adapters to look for in the read.")
public List<IlluminaAdapterPair> ADAPTERS_TO_CHECK = new ArrayList<>(
        Arrays.asList(IlluminaAdapterPair.INDEXED,
                IlluminaAdapterPair.DUAL_INDEXED,
                IlluminaAdapterPair.NEXTERA_V2,
                IlluminaAdapterPair.FLUIDIGM));

@Option(doc = "For specifying adapters other than standard Illumina", optional = true)
public String FIVE_PRIME_ADAPTER;

@Option(doc = "For specifying adapters other than standard Illumina", optional = true)
public String THREE_PRIME_ADAPTER;

@Option(doc = "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " +
        "the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will" +
        " be the number available on the machine less NUM_PROCESSORS.")
public Integer NUM_PROCESSORS = 0;

@Option(doc = "If set, this is the first tile to be processed (used for debugging). Note that tiles are not processed" +
        " in numerical order.",
        optional = true)
public Integer FIRST_TILE;

@Option(doc = "If set, process no more than this many tiles (used for debugging).", optional = true)
public Integer TILE_LIMIT;

@Option(doc = "If true, call System.gc() periodically. This is useful in cases in which the -Xmx value passed " +
        "is larger than the available memory.")
public Boolean FORCE_GC = true;

@Option(doc = "Apply EAMSS filtering to identify inappropriately quality scored bases towards the ends of reads" +
        " and convert their quality scores to Q2.")
public boolean APPLY_EAMSS_FILTER = true;

@Option(doc = "Configure SortingCollections to store this many records before spilling to disk. For an indexed" +
        " run, each SortingCollection gets this value/number of indices.")
public int MAX_READS_IN_RAM_PER_TILE = 1200000;

@Option(doc = "The minimum quality (after transforming 0s to 1s) expected from reads. If qualities are lower than this value, an error is thrown." +
        "The default of 2 is what the Illumina's spec describes as the minimum, but in practice the value has been observed lower.")
public int MINIMUM_QUALITY = BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY;

@Option(doc = "Whether to include non-PF reads", shortName = "NONPF", optional = true)
public boolean INCLUDE_NON_PF_READS = true;

@Option(doc = "Whether to ignore reads whose barcodes are not found in LIBRARY_PARAMS. Useful when outputting " +
        "BAMs for only a subset of the barcodes in a lane.", shortName = "IGNORE_UNEXPECTED")
public boolean IGNORE_UNEXPECTED_BARCODES = false;

@Option(doc = "The tag to use to store any molecular indexes. If more than one molecular index is found, they will be concatenated and stored here.", optional = true)
public String MOLECULAR_INDEX_TAG = "RX";

@Option(doc = "The tag to use to store any molecular index base qualities. If more than one molecular index is found, their qualities will be concatenated and stored here " +
        "(.i.e. the number of \"M\" operators in the READ_STRUCTURE)", optional = true)
public String MOLECULAR_INDEX_BASE_QUALITY_TAG = "QX";

// Element type is declared explicitly instead of using a raw List.
@Option(doc = "The list of tags to store each molecular index. The number of tags should match the number of molecular indexes.", optional = true)
public List<String> TAG_PER_MOLECULAR_INDEX;

// Maps a concatenated sample-barcode string to its writer; the null key holds the writer used when there is
// no barcode (single OUTPUT, or a LIBRARY_PARAMS row containing "N").
private final Map<String, SAMFileWriterWrapper> barcodeSamWriterMap = new HashMap<>();

// Parsed from READ_STRUCTURE in customCommandLineValidation().
private ReadStructure readStructure;

// NOTE(review): left as a raw type; the type argument is presumably SAMRecordsForCluster - confirm against
// the declaration of BasecallsConverter before parameterizing.
private BasecallsConverter basecallsConverter;

private static final Log log = Log.getInstance(IlluminaBasecallsToSam.class);
/**
 * Runs the program: sets up the writers and converter via {@code initialize()} and then calls
 * {@code doTileProcessing()} on the converter.
 *
 * @return always 0
 */
@Override
protected int doWork() {
    this.initialize();
    this.basecallsConverter.doTileProcessing();
    return 0;
}
/**
 * Checks the output / library-params files, builds the per-barcode writers, constructs the basecalls converter
 * matching the file layout found in BASECALLS_DIR, and attaches a ClusterDataToSamConverter to it.
 */
private void initialize() {
    final BclQualityEvaluationStrategy bclQualityEvaluationStrategy = new BclQualityEvaluationStrategy(MINIMUM_QUALITY);

    if (OUTPUT != null) {
        IOUtil.assertFileIsWritable(OUTPUT);
    }
    if (LIBRARY_PARAMS != null) {
        IOUtil.assertFileIsReadable(LIBRARY_PARAMS);
    }

    // A single OUTPUT is registered under the null key; otherwise one writer per LIBRARY_PARAMS row.
    if (OUTPUT != null) {
        barcodeSamWriterMap.put(null, buildSamFileWriter(OUTPUT, SAMPLE_ALIAS, LIBRARY_NAME, buildSamHeaderParameters(null), true));
    } else {
        populateWritersFromLibraryParams();
    }

    final int numOutputRecords = readStructure.templates.length();

    // The per-tile budget is split across the records of a cluster. Clamp to at least 1 so that a small
    // MAX_READS_IN_RAM_PER_TILE never yields a zero-sized budget; both converter branches now share the guard
    // that previously only the CBCL branch applied.
    final int maxReadsInRamPerCollection = Math.max(1, MAX_READS_IN_RAM_PER_TILE / numOutputRecords);

    // Combine any adapters and custom adapter pairs from the command line into a list for use in clipping
    final List<AdapterPair> adapters = new ArrayList<>(ADAPTERS_TO_CHECK);
    if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
        adapters.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
    }

    if (IlluminaFileUtil.hasCbcls(BASECALLS_DIR, LANE)) {
        if (BARCODES_DIR == null) {
            BARCODES_DIR = BASECALLS_DIR;
        }
        basecallsConverter = new NewIlluminaBasecallsConverter<>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
                barcodeSamWriterMap, true, maxReadsInRamPerCollection,
                TMP_DIR, NUM_PROCESSORS,
                FIRST_TILE, TILE_LIMIT, new QueryNameComparator(),
                new Codec(numOutputRecords),
                SAMRecordsForCluster.class, bclQualityEvaluationStrategy, IGNORE_UNEXPECTED_BARCODES);
    } else {
        basecallsConverter = new IlluminaBasecallsConverter<>(BASECALLS_DIR, BARCODES_DIR, LANE, readStructure,
                barcodeSamWriterMap, true, maxReadsInRamPerCollection, TMP_DIR, NUM_PROCESSORS, FORCE_GC,
                FIRST_TILE, TILE_LIMIT, new QueryNameComparator(), new Codec(numOutputRecords), SAMRecordsForCluster.class,
                bclQualityEvaluationStrategy, APPLY_EAMSS_FILTER, INCLUDE_NON_PF_READS, IGNORE_UNEXPECTED_BARCODES);
    }

    /*
     * Be sure to pass the outputReadStructure to ClusterDataToSamConverter, which reflects the structure of the output cluster
     * data which may be different from the input read structure (specifically if there are skips).
     */
    final ClusterDataToSamConverter converter = new ClusterDataToSamConverter(RUN_BARCODE, READ_GROUP_ID,
            basecallsConverter.getFactory().getOutputReadStructure(), adapters)
            .withMolecularIndexTag(MOLECULAR_INDEX_TAG)
            .withMolecularIndexQualityTag(MOLECULAR_INDEX_BASE_QUALITY_TAG)
            .withTagPerMolecularIndex(TAG_PER_MOLECULAR_INDEX);
    basecallsConverter.setConverter(converter);

    log.info("DONE_READING STRUCTURE IS " + readStructure.toString());
}
/**
 * Assert that expectedCols are present and return actualCols - expectedCols.
 * Neither input set is modified; the result is a fresh set.
 *
 * @param actualCols   The columns present in the LIBRARY_PARAMS file
 * @param expectedCols The columns that are REQUIRED
 * @return actualCols - expectedCols
 * @throws PicardException if any of expectedCols is absent from actualCols
 */
private Set<String> findAndFilterExpectedColumns(final Set<String> actualCols, final Set<String> expectedCols) {
    final Set<String> missingColumns = new HashSet<>(expectedCols);
    missingColumns.removeAll(actualCols);

    if (!missingColumns.isEmpty()) {
        throw new PicardException(String.format(
                "LIBRARY_PARAMS file %s is missing the following columns: %s.",
                LIBRARY_PARAMS.getAbsolutePath(), StringUtil.join(", ", missingColumns)));
    }

    final Set<String> remainingColumns = new HashSet<>(actualCols);
    remainingColumns.removeAll(expectedCols);
    return remainingColumns;
}
/**
 * Given a set of columns, assert that every column conforms to the format of an RG header attribute
 * (i.e. exactly 2 characters) and that none of them is one of the RG tags that buildSamHeaderParameters
 * already populates by default.
 *
 * @param rgTagColumns A set of columns that should conform to the rg header attribute format
 * @throws PicardException if a column collides with a default tag or is not exactly two characters long
 */
private void checkRgTagColumns(final Set<String> rgTagColumns) {
    // Work on a copy so the intersection does not write through to the map's key-set view.
    final Set<String> forbiddenHeaders = new HashSet<>(buildSamHeaderParameters(null).keySet());
    forbiddenHeaders.retainAll(rgTagColumns);

    if (!forbiddenHeaders.isEmpty()) {
        throw new PicardException("Illegal ReadGroup tags in library params(barcode params) file(" + LIBRARY_PARAMS.getAbsolutePath() + ") Offending headers = " + StringUtil.join(", ", forbiddenHeaders));
    }

    for (final String column : rgTagColumns) {
        // RG tags are exactly two characters; shorter labels (including empty ones) are rejected as well as longer ones.
        if (column.length() != 2) {
            throw new PicardException("Column label (" + column + ") unrecognized. Library params(barcode params) can only contain the columns " +
                    "(OUTPUT, LIBRARY_NAME, SAMPLE_ALIAS, BARCODE, BARCODE_X where X is a positive integer) OR two letter RG tags!");
        }
    }
}
/**
 * For each line in the LIBRARY_PARAMS file create a SamFileWriter and put it in the barcodeSamWriterMap map, where
 * the key to the map is the concatenation of all sampleBarcodes in order for the given line (or null when the
 * row has no barcode columns or any barcode value is "N").
 *
 * The parser is closed on every exit path, including when validation fails.
 *
 * @throws PicardException if a required column is missing, a barcode key is repeated, or the file has no data rows
 */
private void populateWritersFromLibraryParams() {
    final TabbedTextFileWithHeaderParser libraryParamsParser = new TabbedTextFileWithHeaderParser(LIBRARY_PARAMS);
    try {
        final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME");
        final List<String> barcodeColumnLabels = new ArrayList<>();

        if (readStructure.sampleBarcodes.length() == 1) {
            // For the single barcode read case, the barcode label name can either be BARCODE or BARCODE_1
            if (libraryParamsParser.hasColumn("BARCODE")) {
                barcodeColumnLabels.add("BARCODE");
            } else if (libraryParamsParser.hasColumn("BARCODE_1")) {
                barcodeColumnLabels.add("BARCODE_1");
            } else {
                throw new PicardException("LIBRARY_PARAMS(BARCODE_PARAMS) file " + LIBRARY_PARAMS + " does not have column BARCODE or BARCODE_1.");
            }
        } else {
            for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) {
                barcodeColumnLabels.add("BARCODE_" + i);
            }
        }

        expectedColumnLabels.addAll(barcodeColumnLabels);
        // Whatever is left after removing the required columns is treated as extra RG tag columns.
        final Set<String> rgTagColumns = findAndFilterExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels);
        checkRgTagColumns(rgTagColumns);

        for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) {
            List<String> barcodeValues = null;

            if (!barcodeColumnLabels.isEmpty()) {
                barcodeValues = new ArrayList<>();
                for (final String barcodeLabel : barcodeColumnLabels) {
                    barcodeValues.add(row.getField(barcodeLabel));
                }
            }

            final String key = (barcodeValues == null || barcodeValues.contains("N")) ? null : StringUtil.join("", barcodeValues);
            if (barcodeSamWriterMap.containsKey(key)) { // This will catch the case of having more than 1 line in a non-barcoded LIBRARY_PARAMS file
                throw new PicardException("Row for barcode " + key + " appears more than once in LIBRARY_PARAMS or BARCODE_PARAMS file " +
                        LIBRARY_PARAMS);
            }

            final Map<String, String> samHeaderParams = buildSamHeaderParameters(barcodeValues);
            for (final String tagName : rgTagColumns) {
                samHeaderParams.put(tagName, row.getField(tagName));
            }

            final SAMFileWriterWrapper writer = buildSamFileWriter(new File(row.getField("OUTPUT")),
                    row.getField("SAMPLE_ALIAS"), row.getField("LIBRARY_NAME"), samHeaderParams, true);
            barcodeSamWriterMap.put(key, writer);
        }

        if (barcodeSamWriterMap.isEmpty()) {
            throw new PicardException("LIBRARY_PARAMS(BARCODE_PARAMS) file " + LIBRARY_PARAMS + " does not have any data rows.");
        }
    } finally {
        // Release the parser even when one of the checks above throws.
        libraryParamsParser.close();
    }
}
/**
 * Create the list of headers that will be added to the SAMFileHeader for a library with the given sampleBarcodes (or
 * the entire run if sampleBarcodes == NULL). Note that any value that is null will NOT be added via buildSamFileWriter
 * but is placed in the map in order to be able to query the tags that we automatically add.
 *
 * The returned map is a fresh, mutable, insertion-ordered map containing the keys PL, PU, CN and DT.
 *
 * @param barcodes The list of sampleBarcodes that uniquely identify the read group we are building parameters for
 * @return A Map of ReadGroupHeaderTags -> Values
 */
private Map<String, String> buildSamHeaderParameters(final List<String> barcodes) {
    final Map<String, String> params = new LinkedHashMap<>();

    // Platform unit is RUN_BARCODE.LANE, with the barcode string appended when barcodes are given.
    String platformUnit = RUN_BARCODE + "." + LANE;
    if (barcodes != null) {
        platformUnit += ("." + IlluminaUtil.barcodeSeqsToString(barcodes));
    }

    params.put("PL", PLATFORM);
    params.put("PU", platformUnit);
    params.put("CN", SEQUENCING_CENTER);
    // DT is kept as a null entry when no run start date was supplied.
    params.put("DT", RUN_START_DATE == null ? null : new Iso8601Date(RUN_START_DATE).toString());
    return params;
}
/**
 * Build a SamFileWriter that will write its contents to the output file.
 * The header carries a single read group with id READ_GROUP_ID and sort order queryname.
 *
 * @param output           The file to which to write
 * @param sampleAlias      The sample alias set in the read group header
 * @param libraryName      The name of the library to which this read group belongs; skipped when null
 * @param headerParameters Header parameters that will be added to the RG header for this SamFile;
 *                         entries with a null value are skipped
 * @param presorted        Passed through to SAMFileWriterFactory.makeSAMOrBAMWriter
 * @return A SAMFileWriter
 */
private SAMFileWriterWrapper buildSamFileWriter(final File output, final String sampleAlias,
                                                final String libraryName, final Map<String, String> headerParameters,
                                                final boolean presorted) {
    IOUtil.assertFileIsWritable(output);

    final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID);
    rg.setSample(sampleAlias);
    if (libraryName != null) {
        rg.setLibrary(libraryName);
    }
    for (final Map.Entry<String, String> tagNameToValue : headerParameters.entrySet()) {
        if (tagNameToValue.getValue() != null) {
            rg.setAttribute(tagNameToValue.getKey(), tagNameToValue.getValue());
        }
    }

    final SAMFileHeader header = new SAMFileHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    header.addReadGroup(rg);

    return new SAMFileWriterWrapper(new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, output));
}
/**
 * Command-line entry point: runs a new instance with the given arguments and exits the JVM with the
 * status returned by {@code instanceMain}.
 */
public static void main(final String[] args) {
    final int exitCode = new IlluminaBasecallsToSam().instanceMain(args);
    System.exit(exitCode);
}
/**
 * Put any custom command-line validation in an override of this method.
 * clp is initialized at this point and can be used to print usage and access args.
 * Any options set by command-line parser can be validated.
 *
 * Besides validating, this method folds BARCODE_PARAMS into LIBRARY_PARAMS, parses READ_STRUCTURE into
 * readStructure, and fills in a default READ_GROUP_ID when none was given.
 *
 * @return null if command line is valid. If command line is invalid, returns an array of error message
 * to be written to the appropriate place.
 */
@Override
protected String[] customCommandLineValidation() {
    if (BARCODE_PARAMS != null) {
        LIBRARY_PARAMS = BARCODE_PARAMS;
    }

    final List<String> messages = new ArrayList<>();

    readStructure = new ReadStructure(READ_STRUCTURE);
    if (!readStructure.sampleBarcodes.isEmpty() && LIBRARY_PARAMS == null) {
        messages.add("BARCODE_PARAMS or LIBRARY_PARAMS is missing. If READ_STRUCTURE contains a B (barcode)" +
                " then either LIBRARY_PARAMS or BARCODE_PARAMS(deprecated) must be provided!");
    }

    if (READ_GROUP_ID == null) {
        // Use at most the first 5 characters; a shorter RUN_BARCODE is used whole rather than
        // triggering a StringIndexOutOfBoundsException.
        READ_GROUP_ID = RUN_BARCODE.substring(0, Math.min(RUN_BARCODE.length(), 5)) + "." + LANE;
    }

    // The option has no initializer, so tolerate it being left null as well as empty.
    if (TAG_PER_MOLECULAR_INDEX != null && !TAG_PER_MOLECULAR_INDEX.isEmpty()
            && TAG_PER_MOLECULAR_INDEX.size() != readStructure.molecularBarcode.length()) {
        messages.add("The number of tags given in TAG_PER_MOLECULAR_INDEX does not match the number of molecular indexes in READ_STRUCTURE");
    }

    if ((FIVE_PRIME_ADAPTER == null) != (THREE_PRIME_ADAPTER == null)) {
        messages.add("THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set.");
    }

    if (messages.isEmpty()) {
        return null;
    }
    return messages.toArray(new String[0]);
}
/**
 * Adapts a SAMFileWriter to the ConvertedClusterDataWriter interface: each SAMRecordsForCluster is written
 * by adding its records to the wrapped writer in array order.
 */
private static final class SAMFileWriterWrapper
        implements BasecallsConverter.ConvertedClusterDataWriter<SAMRecordsForCluster> {
    public final SAMFileWriter writer;

    private SAMFileWriterWrapper(final SAMFileWriter writer) {
        this.writer = writer;
    }

    /** Adds every record of the cluster to the underlying writer. */
    @Override
    public void write(final SAMRecordsForCluster records) {
        for (final SAMRecord rec : records.records) {
            writer.addAlignment(rec);
        }
    }

    /** Closes the underlying writer. */
    @Override
    public void close() {
        writer.close();
    }
}
/**
 * Fixed-size holder for the SAMRecords that belong together as one unit; the array is allocated with
 * null slots for the caller to fill in.
 */
static class SAMRecordsForCluster {
    final SAMRecord[] records;

    /**
     * @param numRecords length of the records array to allocate
     */
    SAMRecordsForCluster(final int numRecords) {
        this.records = new SAMRecord[numRecords];
    }
}
/**
 * Orders SAMRecordsForCluster instances by comparing the first record of each with a
 * SAMRecordQueryNameComparator.
 */
static class QueryNameComparator implements Comparator<SAMRecordsForCluster> {
    private final SAMRecordQueryNameComparator comparator = new SAMRecordQueryNameComparator();

    @Override
    public int compare(final SAMRecordsForCluster s1, final SAMRecordsForCluster s2) {
        // Only records[0] takes part in the comparison.
        return comparator.compare(s1.records[0], s2.records[0]);
    }
}
/**
 * SortingCollection codec for SAMRecordsForCluster: encodes a cluster as numRecords consecutive
 * BAM-encoded records and decodes the same number back into one SAMRecordsForCluster.
 */
static class Codec implements SortingCollection.Codec<SAMRecordsForCluster> {
    private final BAMRecordCodec bamCodec;
    private final int numRecords;

    Codec(final int numRecords, final BAMRecordCodec bamCodec) {
        this.numRecords = numRecords;
        this.bamCodec = bamCodec;
    }

    Codec(final int numRecords) {
        this(numRecords, new BAMRecordCodec(null));
    }

    @Override
    public void setOutputStream(final OutputStream os) {
        bamCodec.setOutputStream(os);
    }

    @Override
    public void setInputStream(final InputStream is) {
        bamCodec.setInputStream(is);
    }

    /**
     * Writes every record of the cluster, in array order.
     *
     * @throws IllegalStateException if the cluster does not hold exactly numRecords records
     */
    @Override
    public void encode(final SAMRecordsForCluster val) {
        if (val.records.length != numRecords) {
            throw new IllegalStateException(String.format("Expected number of clusters %d != actual %d",
                    numRecords, val.records.length));
        }
        for (final SAMRecord rec : val.records) {
            bamCodec.encode(rec);
        }
    }

    /**
     * Reads numRecords records and groups them into one cluster.
     *
     * @return the decoded cluster, or null when the first decode returns null
     * @throws IllegalStateException if decoding returns null after the first record of a cluster
     */
    @Override
    public SAMRecordsForCluster decode() {
        final SAMRecord zerothRecord = bamCodec.decode();
        if (zerothRecord == null) {
            return null;
        }

        final SAMRecordsForCluster ret = new SAMRecordsForCluster(numRecords);
        ret.records[0] = zerothRecord;
        for (int i = 1; i < numRecords; ++i) {
            ret.records[i] = bamCodec.decode();
            if (ret.records[i] == null) {
                throw new IllegalStateException(String.format("Expected to read %d records but read only %d", numRecords, i));
            }
        }
        return ret;
    }

    /** Returns a new Codec with the same record count and a clone of the underlying BAM codec. */
    @Override
    public SortingCollection.Codec<SAMRecordsForCluster> clone() {
        return new Codec(numRecords, bamCodec.clone());
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy