org.broadinstitute.hellbender.tools.spark.RevertSamSpark Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.spark;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import htsjdk.samtools.*;
import htsjdk.samtools.util.*;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.table.TableCodec;
import org.broadinstitute.hellbender.utils.codecs.table.TableFeature;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.spark.SparkUtils;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import scala.Tuple2;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Reverts a SAM file by optionally restoring original quality scores and by removing
 * all alignment information.
 * 
 * 

 * This tool removes or restores certain properties of the SAM records, including alignment information.
 * It can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of
 * restoring the original quality scores of a BAM file that has already undergone base quality score recalibration
 * (BQSR) if the original qualities were retained during the calibration (in the OQ tag).
 * 

 * 
Usage Examples
 * Output to a single file
 *  * gatk RevertSamSpark  \\
 *      -I input.bam \\
 *      -O reverted.bam
 * 
 * 
 * 
Output by read group into multiple files with sample map
 *  * gatk RevertSamSpark \\
 *      -I input.bam \\
 *      --output-by-readgroup \\
 *      --output-map reverted_bam_paths.tsv
 * 
 * 
 * 
Output by read group with no output map
 *  * gatk RevertSamSpark \\
 *      -I input.bam \\
 *      --output-by-readgroup \\
 *      -O /write/reverted/read/group/bams/in/this/dir
 * 
 * This will output a BAM (Can be overridden with outputByReadgroupFileFormat option.)
 * 

 * Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to
 * LENIENT or SILENT if the failures are expected to be obviated by the reversion process
 * (e.g. invalid alignment information will be obviated when the keepAlignmentInformation option is used).
 */

@DocumentedFeature
@CommandLineProgramProperties(
        summary = RevertSamSpark.USAGE_DETAILS,
        oneLineSummary = RevertSamSpark.USAGE_SUMMARY,
        programGroup = ReadDataManipulationProgramGroup.class)
@BetaFeature
public class RevertSamSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    static final String USAGE_SUMMARY = "Reverts SAM, BAM or CRAM files to a previous state.";
    static final String USAGE_DETAILS = "This tool removes or restores certain properties of the SAM records, including alignment " +
            "information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " +
            "restoring the original quality scores of a BAM file that has already undergone base quality score recalibration (BQSR) if the" +
            "original qualities were retained.\n" +
            "Examples\n" +
            "Example with single output:\n" +
            "gatk RevertSamSpark \\\n" +
            "     -I input.bam \\\n" +
            "     -O reverted.bam\n" +
            "\n" +
            "Example outputting by read group with output map:\n" +
            "gatk RevertSamSpark \\\n" +
            "     -I input.bam \\\n" +
            "     --output-by-readgroup \\\n" +
            "     --output-map reverted_bam_paths.tsv\n" +
            "\n" +
            "Will output a BAM/CRAM/SAM file per read group.\n" +
            "Example outputting by read group without output map:\n" +
            "gatk RevertSamSpark \\\n" +
            "     -I input.bam \\\n" +
            "     --output-by-readgroup \\\n" +
            "     -O /write/reverted/read/group/bams/in/this/dir\n" +
            "\n" +
            "Will output a BAM file per read group." +
            " Output format can be overridden with the outputByReadgroupFileFormat option.\n" +
            "Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to " +
            "LENIENT or SILENT if the failures are expected to be obviated by the reversion process " +
            "(e.g. invalid alignment information will be obviated when the keep-alignment-information option is used).\n" +
            "";
    public static final String OUTPUT_MAP_READ_GROUP_FIELD_NAME = "READ_GROUP_ID";
    public static final String OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME = "OUTPUT";

    @Override
    public boolean requiresReads() { return true; }

    @Argument(mutex = {OUTPUT_MAP_LONG_NAME}, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            doc = "The output SAM/BAM/CRAM file to create, or an output directory if '--output-by-readgroup' is set.")
    public String output;

    public static final String OUTPUT_MAP_LONG_NAME = "output-map";
    @Argument(mutex = {StandardArgumentDefinitions.OUTPUT_LONG_NAME},
            fullName = OUTPUT_MAP_LONG_NAME,
            doc = "Tab separated file with two columns, OUTPUT_MAP_READ_GROUP_FIELD_NAME and OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME, providing file mapping only used if '--output-by-readgroup' is set.")
    public String outputMap;

    public static final String OUTPUT_BY_READGROUP_LONG_NAME = "output-by-readgroup";
    @Argument(fullName = OUTPUT_BY_READGROUP_LONG_NAME, doc = "When true, outputs each read group in a separate file.")
    public boolean outputByReadGroup = false;

    @Argument(doc = "WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " +
            "a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " +
            "mates, duplicated records, records with mismatches in length of bases and qualities. This option should " +
            "only be enabled if the output sort order is queryname and will always cause sorting to occur.")
    public boolean sanitize = false;

    public static final String KEEP_FIRST_DUPLICATE_LONG_NAME = "keep-first-duplicate";
    @Argument(doc = "If 'sanitize' only one record when we find more than one record with the same name for R1/R2/unpaired reads respectively. " +
            "For paired end reads, keeps only the first R1 and R2 found respectively, and discards all unpaired reads. " +
            "Duplicates do not refer to the duplicate flag in the FLAG field, but instead reads with the same name.",
            fullName = KEEP_FIRST_DUPLICATE_LONG_NAME)
    public boolean keepFirstDuplicate = false;

    public static final String OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME = "output-by-readgroup-file-format";
    @Argument(fullName = OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME, doc = "When using --output-by-readgroup, the output file format can be set to a certain format.")
    public FileType outputByReadgroupFileFormat = FileType.dynamic;

    @Argument(shortName = StandardArgumentDefinitions.SORT_ORDER_SHORT_NAME, fullName = StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, doc = "The sort order to create the reverted output file with, defaults to whatever is specified in the current file", optional = true)
    public SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.queryname;

    public static final String DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME = "dont-restore-original-qualities";
    @Argument( fullName = DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME, doc = "Set to prevent the tool from setting the OQ field to the QUAL where available.", optional = true)
    public boolean dontRestoreOriginalQualities = false;

    public static final String DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME = "remove-duplicate-information";
    @Argument(fullName = DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME, doc = "By default we remove duplicate read flags from all reads.  Note that if this is true, " +
            " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.")
    public boolean keepDuplicateInformation = false;

    public static final String KEEP_ALIGNMENT_INFORMATION = "keep-alignment-information";
    @Argument(fullName = KEEP_ALIGNMENT_INFORMATION, doc = "Don't remove any of the alignment information from the file.")
    public boolean keepAlignmentInformation = false;

    public static final String ATTRIBUTE_TO_CLEAR_LONG_NAME = "attributes-to-clear";
    @Argument(fullName = ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true)
    public Set attributesToClear = new HashSet();

    public static final String REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME = "remove-default-attributes-to-clear";
    @Argument(fullName = REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true)
    public boolean removeDefaults = false;

    public static final String SAMPLE_ALIAS_ARG = "sample-alias";
    @Argument(fullName = SAMPLE_ALIAS_ARG, doc = "The sample alias to use in the reverted output file.  This will override the existing " +
            "sample alias in the file and is used only if all the read groups in the input file have the " +
            "same sample alias.", shortName = StandardArgumentDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true)
    public String sampleAlias;

    public static final String LIBRARY_NAME_ARG = "library-name";
    @Argument(fullName = LIBRARY_NAME_ARG, doc = "The library name to use in the reverted output file.  This will override the existing " +
            "sample alias in the file and is used only if all the read groups in the input file have the " +
            "same library name.", optional = true)
    public String libraryName;

    @Override
    public List getDefaultReadFilters() {
        return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS);
    }

    final public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){
        private static final long serialVersionUID = 1L;{
        add(SAMTag.NM.name());
        add(SAMTag.UQ.name());
        add(SAMTag.PG.name());
        add(SAMTag.MD.name());
        add(SAMTag.MQ.name());
        add(SAMTag.SA.name()); // Supplementary alignment metadata
        add(SAMTag.MC.name()); // Mate Cigar
        add(SAMTag.AS.name());
    }});

    public enum FileType implements CommandLineParser.ClpEnum {
        sam("Generate SAM files."),
        bam("Generate BAM files."),
        cram("Generate CRAM files."),
        dynamic("Generate files based on the extension of input.");

        final String description;

        FileType(String description) {
            this.description = description;
        }

        @Override
        public String getHelpDoc() {
            return description;
        }
    }

    /**
     * Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort.
     * Also checks to ensure that the user has chosen a valid subset of arguments pertaining to output and sanitization.
     */
    @Override
    protected String[] customCommandLineValidation() {
        final List errors = new ArrayList<>();
        validateOutputParams(outputByReadGroup, output, outputMap);

        if (!sanitize && keepFirstDuplicate) {
            errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'");
        }

        if (!errors.isEmpty()) {
            return errors.toArray(new String[errors.size()]);
        }
        return null;
    }

    @Override
    protected void runTool(JavaSparkContext ctx) {
        Broadcast headerBroadcast = ctx.broadcast(getHeaderForReads());
        JavaRDD reads = getReads();

        ////////////////////////////////////////////////////////////////////////////
        // Grab the input header and remap values where appropriate
        ////////////////////////////////////////////////////////////////////////////
        SAMFileHeader localHeader = headerBroadcast.getValue();
        validateHeaderOverrides(localHeader, sampleAlias, libraryName);
        if (sampleAlias != null) {
            localHeader.getReadGroups().forEach(rg -> rg.setSample(sampleAlias));
        }
        if (libraryName != null) {
            localHeader.getReadGroups().forEach(rg -> rg.setLibrary(libraryName));
        }

        ////////////////////////////////////////////////////////////////////////////
        // Map the readgroups in the header to appropriate
        ////////////////////////////////////////////////////////////////////////////
        Map writerMap = getOutputMap(outputMap,
                                                  output,
                                                  getDefaultExtension(readArguments.getReadPathSpecifiers().get(0), outputByReadgroupFileFormat),
                                                  localHeader.getReadGroups(),
                                                  outputByReadGroup);

        ////////////////////////////////////////////////////////////////////////////
        // Construct appropriate headers for the output files
        ////////////////////////////////////////////////////////////////////////////
        final Map headerMap = getReadGroupHeaderMap(localHeader, writerMap);

        // Revert the reads based on the given attributes
        List attributesToRevert = removeDefaults ? DEFAULT_ATTRIBUTES_TO_CLEAR : new ArrayList<>();
        attributesToRevert.addAll(attributesToClear);
        JavaRDD readsReverted = revertReads(reads, attributesToRevert);

        ////////////////////////////////////////////////////////////////////////////
        // Sanitize the reads, sorting them into appropriate order if necessary
        ////////////////////////////////////////////////////////////////////////////
        if (sanitize) {
            Map readGroupFormatMap = createReadGroupFormatMap(readsReverted, headerBroadcast, !dontRestoreOriginalQualities);

            readsReverted = sanitize(readGroupFormatMap, readsReverted, localHeader, keepFirstDuplicate);
        }

        // Write the one or many read output files
        for (Map.Entry rmap: writerMap.entrySet()) {
            //TODO what to do if the readgroup isn't present
            final String key = rmap.getKey();
            JavaRDD filteredreads = rmap.getKey()==null? readsReverted :
                                                                    readsReverted.filter(r -> r.getReadGroup().equals(key));
            writeReads(ctx, rmap.getValue().toString(), filteredreads, headerMap.get(rmap.getKey()), false); //TODO proper header map
        }
    }

    /**
     * Runs the QualityEncodingDetector over a sampling of each readGroup present in the file to detect what the encoding format
     * for base quality those reads are.
     *
     * @param reads Reads RDD over which to iterate and detect readgroups
     * @param inHeader Header describing the readgroups present in the bam
     * @param restoreOriginalQualities Whether to use the OQ tag for determining the map
     * @return the best guess at the quality encoding format present for each readgroup based on the first {@link QualityEncodingDetector#DEFAULT_MAX_RECORDS_TO_ITERATE} reads in each readgroup.
     */
    private Map createReadGroupFormatMap( final JavaRDD reads,
                                                                      final Broadcast inHeader,
                                                                      final boolean restoreOriginalQualities) {
        final Map output = new HashMap<>();

        inHeader.getValue().getReadGroups().stream().forEach(rg -> {
            // For each readgroup filter down to just the reads in that group
            final String key = rg.getId();
            JavaRDD filtered = reads.filter(r -> r.getReadGroup().equals(key));

            // NOTE: this method has the potential to be expensive as it may end up pulling on the first partition many times, and potentially
            //       end up iterating over the entire genome in the case where there are readgroups missing from the bam
            if (!filtered.isEmpty()) {

                // take the number of reads required by QualityEncodingDetector to determine quality score map
                CloseableIterator iterator = new CloseableIterator() {
                    Iterator delegateIterator = filtered.take((int) QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE).iterator();

                    @Override
                    public void close() {
                        delegateIterator = null;
                    }

                    @Override
                    public boolean hasNext() {
                        return delegateIterator != null && delegateIterator.hasNext();
                    }

                    @Override
                    public SAMRecord next() {
                        if (!hasNext()) {
                            throw new NoSuchElementException("hasNext should be called before next");
                        }
                        return delegateIterator.next().convertToSAMRecord(inHeader.getValue());
                    }
                };

                // Save what the quality format for each readgroup was.
                output.put(rg.getId(), QualityEncodingDetector.detect(
                        QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE,
                        iterator,
                        restoreOriginalQualities));
            }
        });
        return output;
    }

    /**
     * If this is run, we want to be careful to remove copied reads from the bam.
     *
     * In order to do this we group each read by its readname and randomly select one read labeled as first in pair
     * and one read labeled as second in pair to treat as the representative reads, throwing away the rest.
     */
    private JavaRDD sanitize(final Map readGroupToFormat, final JavaRDD reads, final SAMFileHeader header, final boolean keepFirstDuplicate) {
        JavaRDD sortedReads = SparkUtils.querynameSortReadsIfNecessary(
                reads.filter(r -> r.getLength() == r.getBaseQualityCount()),
                getRecommendedNumReducers(), header);
        JavaPairRDD> readsByGroup = spanReadsByKey(sortedReads);

        return readsByGroup.flatMap(group -> {
            final List out = Lists.newArrayList();

            List primaryReads = Utils.stream(group._2()).collect(Collectors.toList());

            // Get the number of R1s, R2s, and unpaired reads respectively.
            int firsts = 0, seconds = 0, unpaired = 0;
            GATKRead firstRecord = null, secondRecord = null, unpairedRecord = null;
            for (final GATKRead rec : primaryReads) {
                if (!rec.isPaired()) {
                    if (unpairedRecord == null) {
                        unpairedRecord = rec;
                    }
                    ++unpaired;
                } else {
                    if (rec.isFirstOfPair()) {
                        if (firstRecord == null) {
                            firstRecord = rec;
                        }
                        ++firsts;
                    }
                    if (rec.isSecondOfPair()) {
                        if (secondRecord == null) {
                            secondRecord = rec;
                        }
                        ++seconds;
                    }
                }
            }

            // If we have paired reads, then check that there is exactly one first of pair and one second of pair.
            // Otherwise, check that we have only one unpaired read.
            if (firsts > 0 || seconds > 0) { // if we have any paired reads
                if (firsts != 1 || seconds != 1) { // if we do not have exactly one R1 and one R2
                    if (keepFirstDuplicate && firsts >= 1 && seconds >= 1) { // if we have at least one R1 and one R2, we can discard all but an arbitrary one
                        primaryReads = Arrays.asList(firstRecord, secondRecord);
                    }
                    // Otherwise don't admit anything from this group
                    else {
                        return out.iterator();
                    }
                }
            } else if (unpaired > 1) { // only unpaired reads, and we have too many
                if (keepFirstDuplicate) {
                    primaryReads = Collections.singletonList(unpairedRecord);
                }
                // Otherwise remove these reads entirely
                else {
                    return out.iterator();
                }
            }

            // If we've made it this far spit the records into the output!
            for (final GATKRead rec : primaryReads) {
                // The only valid quality score encoding scheme is standard; if it's not standard, change it.
                final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup());
                if (recordFormat != null && !recordFormat.equals(FastqQualityFormat.Standard)) {
                    final byte[] quals = rec.getBaseQualities();
                    for (int i = 0; i < quals.length; i++) {
                        quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND;
                    }
                    rec.setBaseQualities(quals);
                }
                out.add(rec);
            }
            return out.iterator();
        });

    }

    /**
     * Method which takes an RDD of reads that is guaranteed to have every readname group placed together on the same
     * partition and maps those so a JavaPairRDD with the readname as the key.
     */
    private static JavaPairRDD> spanReadsByKey(final JavaRDD reads) {
        JavaPairRDD nameReadPairs = reads.mapToPair(read -> new Tuple2<>(read.getName(), read));
        return SparkUtils.spanByKey(nameReadPairs).flatMapToPair(namedRead -> {
            // for each name, separate reads by key (group name)
            List>> out = Lists.newArrayList();
            ListMultimap multi = LinkedListMultimap.create();
            for (GATKRead read : namedRead._2()) {
                multi.put(read.getName(), read);
            }
            for (String key : multi.keySet()) {
                // list from Multimap is not serializable by Kryo, so put in a new array list
                out.add(new Tuple2<>(key, Lists.newArrayList(multi.get(key))));
            }
            return out.iterator();
        });
    }


    private Map getReadGroupHeaderMap(SAMFileHeader inHeader, Map writerMap) {
        final Map headerMap;
        if (outputByReadGroup) {
            if (inHeader.getReadGroups().isEmpty()) {
                throw new UserException("The header is missing its read group map");
            }

            assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups());
            headerMap = new HashMap<>();
            for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) {
                final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation);
                header.addReadGroup(readGroup);
                headerMap.put(readGroup.getId(), header);
            }
        } else {
            final SAMFileHeader singleOutHeader = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation);
            inHeader.getReadGroups().forEach(singleOutHeader::addReadGroup);
            headerMap = Collections.singletonMap(null, singleOutHeader);
        }
        return headerMap;
    }

    private static SAMFileHeader createOutHeader(
            final SAMFileHeader inHeader,
            final SAMFileHeader.SortOrder sortOrder,
            final boolean removeAlignmentInformation) {
        final SAMFileHeader outHeader = new SAMFileHeader();
        outHeader.setSortOrder(sortOrder);
        if (!removeAlignmentInformation) {
            outHeader.setSequenceDictionary(inHeader.getSequenceDictionary());
            outHeader.setProgramRecords(inHeader.getProgramRecords());
        }
        return outHeader;
    }

    @VisibleForTesting
    static String getDefaultExtension(final GATKPath inputPath, final FileType setting) {
        if (setting == FileType.dynamic) {
            if (inputPath.isSam()) {
                return FileExtensions.SAM;
            } else if(inputPath.isCram()){
                return FileExtensions.CRAM;
            }
            return FileExtensions.BAM;
        } else {
            return "." + setting.toString();
        }
    }

    /**
     * Takes an individual SAMRecord and applies the set of changes/reversions to it that
     * have been requested by program level options.
     */
    public JavaRDD revertReads(JavaRDD reads, List attributesToClear) {
        Broadcast> attrBroadcast = JavaSparkContext.fromSparkContext(reads.context()).broadcast(attributesToClear);

        if (!dontRestoreOriginalQualities) {
            reads = reads.map(r -> {
                final byte[] oq = ReadUtils.getOriginalBaseQualities(r);
                if (oq != null) {
                    r.setBaseQualities(oq);
                    r.setAttribute("OQ", (String)null);
                }
                return r;
            });
        }

        if (!keepDuplicateInformation) {
            reads = reads.map(r -> {r.setIsDuplicate(false); return r;});
        }

        if (!keepAlignmentInformation) {
            reads = reads.map(rec -> {
                if (rec.isReverseStrand()) {
                    rec.reverseComplement();
                    rec.setIsReverseStrand(false);
                }

                // Remove all alignment based information about the read itself
                rec.setIsUnplaced();
                rec.setCigar(SAMRecord.NO_ALIGNMENT_CIGAR);

                rec.setFragmentLength(0);
                rec.setIsSecondaryAlignment(false);
                rec.setIsProperlyPaired(false);

                // Then remove any mate flags and info related to alignment
                rec.setMateIsUnplaced();

                // And then remove any tags that are calculated from the alignment
                attrBroadcast.getValue().forEach(tag -> rec.setAttribute(tag, (String) null));
                return rec;
            });
        }

        return reads;
    }

    @VisibleForTesting
    static Map getOutputMap(
            final String outputMapFile,
            final String outputDir,
            final String defaultExtension,
            final List readGroups,
            final boolean outputByReadgroup) {

        if (outputByReadgroup) {
            final Map outputMap;
            if (outputMapFile != null) {
                try {
                    outputMap = createOutputMapFromFile(outputMapFile);
                } catch (IOException e) {
                    throw new UserException("Encountered an error reading output map file", e);
                }
            } else {
                outputMap = createOutputMapFromHeader(readGroups, outputDir, defaultExtension);
            }
            return outputMap;
        } else {
            return Collections.singletonMap(null, IOUtils.getPath(outputDir));
        }
    }

    // Names the files based on the locations laid out in the readgroup map
    private static Map createOutputMapFromFile(final String outputMapFile) throws IOException {
        final Map outputMap = new HashMap<>();
        try (final FeatureReader  parser = AbstractFeatureReader.getFeatureReader(outputMapFile, new TableCodec(null), false);) {
            for (final TableFeature row : parser.iterator()) {
                final String id = row.get(OUTPUT_MAP_READ_GROUP_FIELD_NAME);
                final String output = row.get(OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME);
                final Path outputPath = IOUtils.getPath(output);
                outputMap.put(id, outputPath);
            }
        }
        return outputMap;
    }

    // Names the files based on the readgroups individually presented in the header
    private static Map createOutputMapFromHeader(final List readGroups, final String outputDir, final String extension) {
        final Map outputMap = new HashMap<>();
        for (final SAMReadGroupRecord readGroup : readGroups) {
            final String id = readGroup.getId();
            final String fileName = id + extension;
            final Path outputPath = Paths.get(outputDir, fileName);
            outputMap.put(id, outputPath);
        }
        return outputMap;
    }

    /**
     * Methods used for validating parameters to RevertSam.
     */
    static List validateOutputParams(final boolean outputByReadGroup, final String output, final String outputMap) {
        final List errors = new ArrayList<>();
        try {
            if (outputByReadGroup) {
                errors.addAll(validateOutputParamsByReadGroup(output, outputMap));
            } else {
                errors.addAll(validateOutputParamsNotByReadGroup(output, outputMap));
            }
        } catch (IOException e) {
            throw new UserException.BadInput("Error while validating input file", e);
        }
        return errors;
    }

    @SuppressWarnings("unchecked")
    static List  validateOutputParamsByReadGroup(final String output, final String outputMap) throws IOException {
        final List errors = new ArrayList<>();
        if (output != null) {
            if (!Files.isDirectory(IOUtil.getPath(output))) {
                errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output);
            }
            return errors;
        }
        // output is null if we reached here
        if (outputMap == null) {
            errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set.");
            return errors;
        }
        if (!Files.isReadable(IOUtil.getPath(outputMap))) {
            errors.add("Cannot read outputMap " + outputMap);
            return errors;
        }
        final FeatureReader  parser = AbstractFeatureReader.getFeatureReader(outputMap, new TableCodec(null),false);
        if (!isOutputMapHeaderValid((List)parser.getHeader())) {
            errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with +"+OUTPUT_MAP_READ_GROUP_FIELD_NAME+"+ as first column and output as second column.");
        }
        return errors;
    }

    static List validateOutputParamsNotByReadGroup(final String output, final String outputMap) throws IOException {
        final List errors = new ArrayList<>();
        if (outputMap != null) {
            errors.add("Cannot provide outputMap when '--output-by-read' isn't set. Provide output instead.");
        }
        if (output == null) {
            errors.add("output is required when '--output-by-read'");
            return errors;
        }
        if (Files.isDirectory(IOUtil.getPath(output))) {
            errors.add("output " + output + " should not be a directory when '--output-by-read'");
        }
        return errors;
    }

    /**
     * If we are going to override sampleAlias or libraryName, make sure all the read
     * groups have the same values.
     */
    static void validateHeaderOverrides(
            final SAMFileHeader inHeader,
            final String sampleAlias,
            final String libraryName) {

        final List rgs = inHeader.getReadGroups();
        if (sampleAlias != null || libraryName != null) {
            boolean allSampleAliasesIdentical = true;
            boolean allLibraryNamesIdentical = true;
            for (int i = 1; i < rgs.size(); i++) {
                if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) {
                    allSampleAliasesIdentical = false;
                }
                if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) {
                    allLibraryNamesIdentical = false;
                }
            }
            if (sampleAlias != null && !allSampleAliasesIdentical) {
                throw new UserException("Read groups have multiple values for sample.  " +
                        "A value for sampleAlias cannot be supplied.");
            }
            if (libraryName != null && !allLibraryNamesIdentical) {
                throw new UserException("Read groups have multiple values for library name.  " +
                        "A value for library name cannot be supplied.");
            }
        }
    }

    static void assertAllReadGroupsMapped(final Map outputMap, final List readGroups) {
        for (final SAMReadGroupRecord readGroup : readGroups) {
            final String id = readGroup.getId();
            final Path output = outputMap.get(id);
            if (output == null) {
                throw new GATKException("Read group id " + id + " not found in outputMap " + outputMap);
            }
        }
    }

    static boolean isOutputMapHeaderValid(final List columnLabels) {
        return columnLabels.size() >= 2 &&
                OUTPUT_MAP_READ_GROUP_FIELD_NAME.equals(columnLabels.get(0)) &&
                OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME.equals(columnLabels.get(1));
    }

}