org.broadinstitute.hellbender.tools.spark.RevertSamSpark Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.spark;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import htsjdk.samtools.*;
import htsjdk.samtools.util.*;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.table.TableCodec;
import org.broadinstitute.hellbender.utils.codecs.table.TableFeature;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.spark.SparkUtils;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import scala.Tuple2;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
/**
* Reverts a SAM file by optionally restoring original quality scores and by removing
* all alignment information.
*
*
* This tool removes or restores certain properties of the SAM records, including alignment information.
* It can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of
* restoring the original quality scores of a BAM file that has already undergone base quality score recalibration
* (BQSR) if the original qualities were retained during the calibration (in the OQ tag).
*
*
Usage Examples
* Output to a single file
*
* gatk RevertSamSpark \\
* -I input.bam \\
* -O reverted.bam
*
*
*
Output by read group into multiple files with sample map
*
* gatk RevertSamSpark \\
* -I input.bam \\
* --output-by-readgroup \\
* --output-map reverted_bam_paths.tsv
*
*
*
Output by read group with no output map
*
* gatk RevertSamSpark \\
* -I input.bam \\
* --output-by-readgroup \\
* -O /write/reverted/read/group/bams/in/this/dir
*
* This will output a BAM (Can be overridden with outputByReadgroupFileFormat option.)
*
* Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to
* LENIENT or SILENT if the failures are expected to be obviated by the reversion process
* (e.g. invalid alignment information will be obviated when the keepAlignmentInformation option is used).
*/
@DocumentedFeature
@CommandLineProgramProperties(
summary = RevertSamSpark.USAGE_DETAILS,
oneLineSummary = RevertSamSpark.USAGE_SUMMARY,
programGroup = ReadDataManipulationProgramGroup.class)
@BetaFeature
public class RevertSamSpark extends GATKSparkTool {
private static final long serialVersionUID = 1L;
static final String USAGE_SUMMARY = "Reverts SAM, BAM or CRAM files to a previous state.";
static final String USAGE_DETAILS = "This tool removes or restores certain properties of the SAM records, including alignment " +
"information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " +
"restoring the original quality scores of a BAM file that has already undergone base quality score recalibration (BQSR) if the" +
"original qualities were retained.\n" +
"Examples
\n" +
"Example with single output:
\n" +
"gatk RevertSamSpark \\\n" +
" -I input.bam \\\n" +
" -O reverted.bam\n" +
"\n" +
"Example outputting by read group with output map:
\n" +
"gatk RevertSamSpark \\\n" +
" -I input.bam \\\n" +
" --output-by-readgroup \\\n" +
" --output-map reverted_bam_paths.tsv\n" +
"\n" +
"Will output a BAM/CRAM/SAM file per read group.\n" +
"Example outputting by read group without output map:
\n" +
"gatk RevertSamSpark \\\n" +
" -I input.bam \\\n" +
" --output-by-readgroup \\\n" +
" -O /write/reverted/read/group/bams/in/this/dir\n" +
"\n" +
"Will output a BAM file per read group." +
" Output format can be overridden with the outputByReadgroupFileFormat option.\n" +
"Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to " +
"LENIENT or SILENT if the failures are expected to be obviated by the reversion process " +
"(e.g. invalid alignment information will be obviated when the keep-alignment-information option is used).\n" +
"";
public static final String OUTPUT_MAP_READ_GROUP_FIELD_NAME = "READ_GROUP_ID";
public static final String OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME = "OUTPUT";
@Override
public boolean requiresReads() { return true; }
@Argument(mutex = {OUTPUT_MAP_LONG_NAME}, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
doc = "The output SAM/BAM/CRAM file to create, or an output directory if '--output-by-readgroup' is set.")
public String output;
public static final String OUTPUT_MAP_LONG_NAME = "output-map";
@Argument(mutex = {StandardArgumentDefinitions.OUTPUT_LONG_NAME},
fullName = OUTPUT_MAP_LONG_NAME,
doc = "Tab separated file with two columns, OUTPUT_MAP_READ_GROUP_FIELD_NAME and OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME, providing file mapping only used if '--output-by-readgroup' is set.")
public String outputMap;
public static final String OUTPUT_BY_READGROUP_LONG_NAME = "output-by-readgroup";
@Argument(fullName = OUTPUT_BY_READGROUP_LONG_NAME, doc = "When true, outputs each read group in a separate file.")
public boolean outputByReadGroup = false;
@Argument(doc = "WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " +
"a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " +
"mates, duplicated records, records with mismatches in length of bases and qualities. This option should " +
"only be enabled if the output sort order is queryname and will always cause sorting to occur.")
public boolean sanitize = false;
public static final String KEEP_FIRST_DUPLICATE_LONG_NAME = "keep-first-duplicate";
@Argument(doc = "If 'sanitize' only one record when we find more than one record with the same name for R1/R2/unpaired reads respectively. " +
"For paired end reads, keeps only the first R1 and R2 found respectively, and discards all unpaired reads. " +
"Duplicates do not refer to the duplicate flag in the FLAG field, but instead reads with the same name.",
fullName = KEEP_FIRST_DUPLICATE_LONG_NAME)
public boolean keepFirstDuplicate = false;
public static final String OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME = "output-by-readgroup-file-format";
@Argument(fullName = OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME, doc = "When using --output-by-readgroup, the output file format can be set to a certain format.")
public FileType outputByReadgroupFileFormat = FileType.dynamic;
@Argument(shortName = StandardArgumentDefinitions.SORT_ORDER_SHORT_NAME, fullName = StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, doc = "The sort order to create the reverted output file with, defaults to whatever is specified in the current file", optional = true)
public SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.queryname;
public static final String DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME = "dont-restore-original-qualities";
@Argument( fullName = DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME, doc = "Set to prevent the tool from setting the OQ field to the QUAL where available.", optional = true)
public boolean dontRestoreOriginalQualities = false;
public static final String DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME = "remove-duplicate-information";
@Argument(fullName = DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME, doc = "By default we remove duplicate read flags from all reads. Note that if this is true, " +
" the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.")
public boolean keepDuplicateInformation = false;
public static final String KEEP_ALIGNMENT_INFORMATION = "keep-alignment-information";
@Argument(fullName = KEEP_ALIGNMENT_INFORMATION, doc = "Don't remove any of the alignment information from the file.")
public boolean keepAlignmentInformation = false;
public static final String ATTRIBUTE_TO_CLEAR_LONG_NAME = "attributes-to-clear";
@Argument(fullName = ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true)
public Set attributesToClear = new HashSet();
public static final String REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME = "remove-default-attributes-to-clear";
@Argument(fullName = REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true)
public boolean removeDefaults = false;
public static final String SAMPLE_ALIAS_ARG = "sample-alias";
@Argument(fullName = SAMPLE_ALIAS_ARG, doc = "The sample alias to use in the reverted output file. This will override the existing " +
"sample alias in the file and is used only if all the read groups in the input file have the " +
"same sample alias.", shortName = StandardArgumentDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true)
public String sampleAlias;
public static final String LIBRARY_NAME_ARG = "library-name";
@Argument(fullName = LIBRARY_NAME_ARG, doc = "The library name to use in the reverted output file. This will override the existing " +
"sample alias in the file and is used only if all the read groups in the input file have the " +
"same library name.", optional = true)
public String libraryName;
@Override
public List getDefaultReadFilters() {
return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS);
}
final public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){
private static final long serialVersionUID = 1L;{
add(SAMTag.NM.name());
add(SAMTag.UQ.name());
add(SAMTag.PG.name());
add(SAMTag.MD.name());
add(SAMTag.MQ.name());
add(SAMTag.SA.name()); // Supplementary alignment metadata
add(SAMTag.MC.name()); // Mate Cigar
add(SAMTag.AS.name());
}});
public enum FileType implements CommandLineParser.ClpEnum {
sam("Generate SAM files."),
bam("Generate BAM files."),
cram("Generate CRAM files."),
dynamic("Generate files based on the extension of input.");
final String description;
FileType(String description) {
this.description = description;
}
@Override
public String getHelpDoc() {
return description;
}
}
/**
* Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort.
* Also checks to ensure that the user has chosen a valid subset of arguments pertaining to output and sanitization.
*/
@Override
protected String[] customCommandLineValidation() {
final List errors = new ArrayList<>();
validateOutputParams(outputByReadGroup, output, outputMap);
if (!sanitize && keepFirstDuplicate) {
errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'");
}
if (!errors.isEmpty()) {
return errors.toArray(new String[errors.size()]);
}
return null;
}
@Override
protected void runTool(JavaSparkContext ctx) {
Broadcast headerBroadcast = ctx.broadcast(getHeaderForReads());
JavaRDD reads = getReads();
////////////////////////////////////////////////////////////////////////////
// Grab the input header and remap values where appropriate
////////////////////////////////////////////////////////////////////////////
SAMFileHeader localHeader = headerBroadcast.getValue();
validateHeaderOverrides(localHeader, sampleAlias, libraryName);
if (sampleAlias != null) {
localHeader.getReadGroups().forEach(rg -> rg.setSample(sampleAlias));
}
if (libraryName != null) {
localHeader.getReadGroups().forEach(rg -> rg.setLibrary(libraryName));
}
////////////////////////////////////////////////////////////////////////////
// Map the readgroups in the header to appropriate
////////////////////////////////////////////////////////////////////////////
Map writerMap = getOutputMap(outputMap,
output,
getDefaultExtension(readArguments.getReadPathSpecifiers().get(0), outputByReadgroupFileFormat),
localHeader.getReadGroups(),
outputByReadGroup);
////////////////////////////////////////////////////////////////////////////
// Construct appropriate headers for the output files
////////////////////////////////////////////////////////////////////////////
final Map headerMap = getReadGroupHeaderMap(localHeader, writerMap);
// Revert the reads based on the given attributes
List attributesToRevert = removeDefaults ? DEFAULT_ATTRIBUTES_TO_CLEAR : new ArrayList<>();
attributesToRevert.addAll(attributesToClear);
JavaRDD readsReverted = revertReads(reads, attributesToRevert);
////////////////////////////////////////////////////////////////////////////
// Sanitize the reads, sorting them into appropriate order if necessary
////////////////////////////////////////////////////////////////////////////
if (sanitize) {
Map readGroupFormatMap = createReadGroupFormatMap(readsReverted, headerBroadcast, !dontRestoreOriginalQualities);
readsReverted = sanitize(readGroupFormatMap, readsReverted, localHeader, keepFirstDuplicate);
}
// Write the one or many read output files
for (Map.Entry rmap: writerMap.entrySet()) {
//TODO what to do if the readgroup isn't present
final String key = rmap.getKey();
JavaRDD filteredreads = rmap.getKey()==null? readsReverted :
readsReverted.filter(r -> r.getReadGroup().equals(key));
writeReads(ctx, rmap.getValue().toString(), filteredreads, headerMap.get(rmap.getKey()), false); //TODO proper header map
}
}
/**
* Runs the QualityEncodingDetector over a sampling of each readGroup present in the file to detect what the encoding format
* for base quality those reads are.
*
* @param reads Reads RDD over which to iterate and detect readgroups
* @param inHeader Header describing the readgroups present in the bam
* @param restoreOriginalQualities Whether to use the OQ tag for determining the map
* @return the best guess at the quality encoding format present for each readgroup based on the first {@link QualityEncodingDetector#DEFAULT_MAX_RECORDS_TO_ITERATE} reads in each readgroup.
*/
private Map createReadGroupFormatMap( final JavaRDD reads,
final Broadcast inHeader,
final boolean restoreOriginalQualities) {
final Map output = new HashMap<>();
inHeader.getValue().getReadGroups().stream().forEach(rg -> {
// For each readgroup filter down to just the reads in that group
final String key = rg.getId();
JavaRDD filtered = reads.filter(r -> r.getReadGroup().equals(key));
// NOTE: this method has the potential to be expensive as it may end up pulling on the first partition many times, and potentially
// end up iterating over the entire genome in the case where there are readgroups missing from the bam
if (!filtered.isEmpty()) {
// take the number of reads required by QualityEncodingDetector to determine quality score map
CloseableIterator iterator = new CloseableIterator() {
Iterator delegateIterator = filtered.take((int) QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE).iterator();
@Override
public void close() {
delegateIterator = null;
}
@Override
public boolean hasNext() {
return delegateIterator != null && delegateIterator.hasNext();
}
@Override
public SAMRecord next() {
if (!hasNext()) {
throw new NoSuchElementException("hasNext should be called before next");
}
return delegateIterator.next().convertToSAMRecord(inHeader.getValue());
}
};
// Save what the quality format for each readgroup was.
output.put(rg.getId(), QualityEncodingDetector.detect(
QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE,
iterator,
restoreOriginalQualities));
}
});
return output;
}
/**
* If this is run, we want to be careful to remove copied reads from the bam.
*
* In order to do this we group each read by its readname and randomly select one read labeled as first in pair
* and one read labeled as second in pair to treat as the representative reads, throwing away the rest.
*/
private JavaRDD sanitize(final Map readGroupToFormat, final JavaRDD reads, final SAMFileHeader header, final boolean keepFirstDuplicate) {
JavaRDD sortedReads = SparkUtils.querynameSortReadsIfNecessary(
reads.filter(r -> r.getLength() == r.getBaseQualityCount()),
getRecommendedNumReducers(), header);
JavaPairRDD> readsByGroup = spanReadsByKey(sortedReads);
return readsByGroup.flatMap(group -> {
final List out = Lists.newArrayList();
List primaryReads = Utils.stream(group._2()).collect(Collectors.toList());
// Get the number of R1s, R2s, and unpaired reads respectively.
int firsts = 0, seconds = 0, unpaired = 0;
GATKRead firstRecord = null, secondRecord = null, unpairedRecord = null;
for (final GATKRead rec : primaryReads) {
if (!rec.isPaired()) {
if (unpairedRecord == null) {
unpairedRecord = rec;
}
++unpaired;
} else {
if (rec.isFirstOfPair()) {
if (firstRecord == null) {
firstRecord = rec;
}
++firsts;
}
if (rec.isSecondOfPair()) {
if (secondRecord == null) {
secondRecord = rec;
}
++seconds;
}
}
}
// If we have paired reads, then check that there is exactly one first of pair and one second of pair.
// Otherwise, check that we have only one unpaired read.
if (firsts > 0 || seconds > 0) { // if we have any paired reads
if (firsts != 1 || seconds != 1) { // if we do not have exactly one R1 and one R2
if (keepFirstDuplicate && firsts >= 1 && seconds >= 1) { // if we have at least one R1 and one R2, we can discard all but an arbitrary one
primaryReads = Arrays.asList(firstRecord, secondRecord);
}
// Otherwise don't admit anything from this group
else {
return out.iterator();
}
}
} else if (unpaired > 1) { // only unpaired reads, and we have too many
if (keepFirstDuplicate) {
primaryReads = Collections.singletonList(unpairedRecord);
}
// Otherwise remove these reads entirely
else {
return out.iterator();
}
}
// If we've made it this far spit the records into the output!
for (final GATKRead rec : primaryReads) {
// The only valid quality score encoding scheme is standard; if it's not standard, change it.
final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
if (recordFormat != null && !recordFormat.equals(FastqQualityFormat.Standard)) {
final byte[] quals = rec.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
}
rec.setBaseQualities(quals);
}
out.add(rec);
}
return out.iterator();
});
}
/**
* Method which takes an RDD of reads that is guaranteed to have every readname group placed together on the same
* partition and maps those so a JavaPairRDD with the readname as the key.
*/
private static JavaPairRDD> spanReadsByKey(final JavaRDD reads) {
JavaPairRDD nameReadPairs = reads.mapToPair(read -> new Tuple2<>(read.getName(), read));
return SparkUtils.spanByKey(nameReadPairs).flatMapToPair(namedRead -> {
// for each name, separate reads by key (group name)
List>> out = Lists.newArrayList();
ListMultimap multi = LinkedListMultimap.create();
for (GATKRead read : namedRead._2()) {
multi.put(read.getName(), read);
}
for (String key : multi.keySet()) {
// list from Multimap is not serializable by Kryo, so put in a new array list
out.add(new Tuple2<>(key, Lists.newArrayList(multi.get(key))));
}
return out.iterator();
});
}
private Map getReadGroupHeaderMap(SAMFileHeader inHeader, Map writerMap) {
final Map headerMap;
if (outputByReadGroup) {
if (inHeader.getReadGroups().isEmpty()) {
throw new UserException("The header is missing its read group map");
}
assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups());
headerMap = new HashMap<>();
for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) {
final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation);
header.addReadGroup(readGroup);
headerMap.put(readGroup.getId(), header);
}
} else {
final SAMFileHeader singleOutHeader = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation);
inHeader.getReadGroups().forEach(singleOutHeader::addReadGroup);
headerMap = Collections.singletonMap(null, singleOutHeader);
}
return headerMap;
}
private static SAMFileHeader createOutHeader(
final SAMFileHeader inHeader,
final SAMFileHeader.SortOrder sortOrder,
final boolean removeAlignmentInformation) {
final SAMFileHeader outHeader = new SAMFileHeader();
outHeader.setSortOrder(sortOrder);
if (!removeAlignmentInformation) {
outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
outHeader.setProgramRecords(inHeader.getProgramRecords());
}
return outHeader;
}
@VisibleForTesting
static String getDefaultExtension(final GATKPath inputPath, final FileType setting) {
if (setting == FileType.dynamic) {
if (inputPath.isSam()) {
return FileExtensions.SAM;
} else if(inputPath.isCram()){
return FileExtensions.CRAM;
}
return FileExtensions.BAM;
} else {
return "." + setting.toString();
}
}
/**
* Takes an individual SAMRecord and applies the set of changes/reversions to it that
* have been requested by program level options.
*/
public JavaRDD revertReads(JavaRDD reads, List attributesToClear) {
Broadcast> attrBroadcast = JavaSparkContext.fromSparkContext(reads.context()).broadcast(attributesToClear);
if (!dontRestoreOriginalQualities) {
reads = reads.map(r -> {
final byte[] oq = ReadUtils.getOriginalBaseQualities(r);
if (oq != null) {
r.setBaseQualities(oq);
r.setAttribute("OQ", (String)null);
}
return r;
});
}
if (!keepDuplicateInformation) {
reads = reads.map(r -> {r.setIsDuplicate(false); return r;});
}
if (!keepAlignmentInformation) {
reads = reads.map(rec -> {
if (rec.isReverseStrand()) {
rec.reverseComplement();
rec.setIsReverseStrand(false);
}
// Remove all alignment based information about the read itself
rec.setIsUnplaced();
rec.setCigar(SAMRecord.NO_ALIGNMENT_CIGAR);
rec.setFragmentLength(0);
rec.setIsSecondaryAlignment(false);
rec.setIsProperlyPaired(false);
// Then remove any mate flags and info related to alignment
rec.setMateIsUnplaced();
// And then remove any tags that are calculated from the alignment
attrBroadcast.getValue().forEach(tag -> rec.setAttribute(tag, (String) null));
return rec;
});
}
return reads;
}
@VisibleForTesting
static Map getOutputMap(
final String outputMapFile,
final String outputDir,
final String defaultExtension,
final List readGroups,
final boolean outputByReadgroup) {
if (outputByReadgroup) {
final Map outputMap;
if (outputMapFile != null) {
try {
outputMap = createOutputMapFromFile(outputMapFile);
} catch (IOException e) {
throw new UserException("Encountered an error reading output map file", e);
}
} else {
outputMap = createOutputMapFromHeader(readGroups, outputDir, defaultExtension);
}
return outputMap;
} else {
return Collections.singletonMap(null, IOUtils.getPath(outputDir));
}
}
// Names the files based on the locations laid out in the readgroup map
private static Map createOutputMapFromFile(final String outputMapFile) throws IOException {
final Map outputMap = new HashMap<>();
try (final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMapFile, new TableCodec(null), false);) {
for (final TableFeature row : parser.iterator()) {
final String id = row.get(OUTPUT_MAP_READ_GROUP_FIELD_NAME);
final String output = row.get(OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME);
final Path outputPath = IOUtils.getPath(output);
outputMap.put(id, outputPath);
}
}
return outputMap;
}
// Names the files based on the readgroups individually presented in the header
private static Map createOutputMapFromHeader(final List readGroups, final String outputDir, final String extension) {
final Map outputMap = new HashMap<>();
for (final SAMReadGroupRecord readGroup : readGroups) {
final String id = readGroup.getId();
final String fileName = id + extension;
final Path outputPath = Paths.get(outputDir, fileName);
outputMap.put(id, outputPath);
}
return outputMap;
}
/**
* Methods used for validating parameters to RevertSam.
*/
static List validateOutputParams(final boolean outputByReadGroup, final String output, final String outputMap) {
final List errors = new ArrayList<>();
try {
if (outputByReadGroup) {
errors.addAll(validateOutputParamsByReadGroup(output, outputMap));
} else {
errors.addAll(validateOutputParamsNotByReadGroup(output, outputMap));
}
} catch (IOException e) {
throw new UserException.BadInput("Error while validating input file", e);
}
return errors;
}
@SuppressWarnings("unchecked")
static List validateOutputParamsByReadGroup(final String output, final String outputMap) throws IOException {
final List errors = new ArrayList<>();
if (output != null) {
if (!Files.isDirectory(IOUtil.getPath(output))) {
errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output);
}
return errors;
}
// output is null if we reached here
if (outputMap == null) {
errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set.");
return errors;
}
if (!Files.isReadable(IOUtil.getPath(outputMap))) {
errors.add("Cannot read outputMap " + outputMap);
return errors;
}
final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMap, new TableCodec(null),false);
if (!isOutputMapHeaderValid((List)parser.getHeader())) {
errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with +"+OUTPUT_MAP_READ_GROUP_FIELD_NAME+"+ as first column and output as second column.");
}
return errors;
}
static List validateOutputParamsNotByReadGroup(final String output, final String outputMap) throws IOException {
final List errors = new ArrayList<>();
if (outputMap != null) {
errors.add("Cannot provide outputMap when '--output-by-read' isn't set. Provide output instead.");
}
if (output == null) {
errors.add("output is required when '--output-by-read'");
return errors;
}
if (Files.isDirectory(IOUtil.getPath(output))) {
errors.add("output " + output + " should not be a directory when '--output-by-read'");
}
return errors;
}
/**
* If we are going to override sampleAlias or libraryName, make sure all the read
* groups have the same values.
*/
static void validateHeaderOverrides(
final SAMFileHeader inHeader,
final String sampleAlias,
final String libraryName) {
final List rgs = inHeader.getReadGroups();
if (sampleAlias != null || libraryName != null) {
boolean allSampleAliasesIdentical = true;
boolean allLibraryNamesIdentical = true;
for (int i = 1; i < rgs.size(); i++) {
if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
allSampleAliasesIdentical = false;
}
if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
allLibraryNamesIdentical = false;
}
}
if (sampleAlias != null && !allSampleAliasesIdentical) {
throw new UserException("Read groups have multiple values for sample. " +
"A value for sampleAlias cannot be supplied.");
}
if (libraryName != null && !allLibraryNamesIdentical) {
throw new UserException("Read groups have multiple values for library name. " +
"A value for library name cannot be supplied.");
}
}
}
static void assertAllReadGroupsMapped(final Map outputMap, final List readGroups) {
for (final SAMReadGroupRecord readGroup : readGroups) {
final String id = readGroup.getId();
final Path output = outputMap.get(id);
if (output == null) {
throw new GATKException("Read group id " + id + " not found in outputMap " + outputMap);
}
}
}
static boolean isOutputMapHeaderValid(final List columnLabels) {
return columnLabels.size() >= 2 &&
OUTPUT_MAP_READ_GROUP_FIELD_NAME.equals(columnLabels.get(0)) &&
OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME.equals(columnLabels.get(1));
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy