picard.sam.markduplicates.EstimateLibraryComplexity Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
picard
There is a newer version: 3.2.0
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.sam.markduplicates;

import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Metrics;
import picard.sam.DuplicationMetrics;
import picard.sam.markduplicates.util.AbstractOpticalDuplicateFinderCommandLineProgram;
import picard.sam.util.PhysicalLocationShort;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;

import static java.lang.Math.pow;

/**
 * Attempts to estimate library complexity from sequence alone. Does so by sorting all reads
 * by the first N bases (5 by default) of each read and then comparing reads with the first
 * N bases identical to each other for duplicates.  Reads are considered to be duplicates if
 * they match each other with no gaps and an overall mismatch rate less than or equal to
 * MAX_DIFF_RATE (0.03 by default).
 * 
 * 
Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering
 * removes reads with any no-calls in the first N bases or with a mean base quality lower than
 * MIN_MEAN_QUALITY across either the first or second read.
 * 
 * 
The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes
 * these in the calculation of library size. Also, since there is no alignment to screen out technical
 * reads one further filter is applied on the data.  After examining all reads a Histogram is built of
 * [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are
 * then removed from the Histogram as outliers before library size is estimated.
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
        usage = EstimateLibraryComplexity.USAGE_SUMMARY + EstimateLibraryComplexity.USAGE_DETAILS,
        usageShort = EstimateLibraryComplexity.USAGE_SUMMARY,
        programGroup = Metrics.class
)
public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCommandLineProgram {
    static final String USAGE_SUMMARY = "Estimates the numbers of unique molecules in a sequencing library.  ";
    static final String USAGE_DETAILS = "This tool outputs quality metrics for a sequencing library preparation." +
            "Library complexity refers to the number of unique DNA fragments present in a given library.  Reductions in complexity " +
            "resulting from PCR amplification during library preparation will ultimately compromise downstream analyses " +
            "via an elevation in the number of duplicate reads.  PCR-associated duplication artifacts can result from: inadequate amounts " +
            "of starting material (genomic DNA, cDNA, etc.), losses during cleanups, and size selection issues.  " +
            "Duplicate reads can also arise from optical duplicates resulting from sequencing-machine optical sensor artifacts.  " +

            "This tool attempts to estimate library complexity from sequence of read pairs alone.  Reads are sorted by the first N bases " +
            "(5 by default) of the first read and then the first N bases of the second read of a pair.   Read pairs are considered to " +
            "be duplicates if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE " +
            "(0.03 by default).  Reads of poor quality are filtered out to provide a more accurate estimate.  The filtering removes reads" +
            " with any poor quality bases as defined by a read's MIN_MEAN_QUALITY (20 is the default value) across either the first or " +
            "second read.  Unpaired reads are ignored in this computation. " +
            "" +
            "The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the calculation " +
            "of library size.  Also, since there is no alignment information used in this algorithm, an additional filter is applied to " +
            "the data as follows.  After examining all reads, a histogram is built in which the number of reads in a duplicate set is " +
            "compared with the number of of duplicate sets.   All bins that contain exactly one duplicate set are then removed from the " +
            "histogram as outliers prior to the library size estimation.  " +

            "Usage example:" +
            "" +
            "java -jar picard.jar EstimateLibraryComplexity \\
" +
            "     I=input.bam \\
" +
            "     O=est_lib_complex_metrics.txt" +
            "" +
            "Please see the documentation for the companion " +
            "MarkDuplicates tool." +
            "";
    @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "One or more files to combine and " +
            "estimate library complexity from. Reads can be mapped or unmapped.")
    public List INPUT;

    @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
            doc = "Output file to writes per-library metrics to.")
    public File OUTPUT;

    @Option(doc = "The minimum number of bases at the starts of reads that must be identical for reads to " +
            "be grouped together for duplicate detection.  In effect total_reads / 4^max_id_bases reads will " +
            "be compared at a time, so lower numbers will produce more accurate results but consume " +
            "exponentially more memory and CPU.")
    public int MIN_IDENTICAL_BASES = 5;

    @Option(doc = "The maximum rate of differences between two reads to call them identical.")
    public double MAX_DIFF_RATE = 0.03;

    @Option(doc = "The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with " +
            "lower average quality are filtered out and not considered in any calculations.")
    public int MIN_MEAN_QUALITY = 20;

    @Option(doc = "Do not process self-similar groups that are this many times over the mean expected group size. " +
            "I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then the mean expected " +
            "group size would be approximately 10 reads.")
    public int MAX_GROUP_RATIO = 500;

    @Option(doc = "Barcode SAM tag (ex. BC for 10X Genomics)", optional = true)
    public String BARCODE_TAG = null;

    @Option(doc = "Read one barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
    public String READ_ONE_BARCODE_TAG = null;

    @Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
    public String READ_TWO_BARCODE_TAG = null;

    @Option(doc = "The maximum number of bases to consider when comparing reads (0 means no maximum).", optional = true)
    public int MAX_READ_LENGTH = 0;

    @Option(doc = "Minimum number group count.  On a per-library basis, we count the number of groups of duplicates " +
            "that have a particular size.  Omit from consideration any count that is less than this value.  For " +
            "example, if we see only one group of duplicates with size 500, we omit it from the metric calculations if " +
            "MIN_GROUP_COUNT is set to two.  Setting this to two may help remove technical artifacts from the library " +
            "size calculation, for example, adapter dimers.", optional = true)
    public int MIN_GROUP_COUNT = 2;

    private final Log log = Log.getInstance(EstimateLibraryComplexity.class);

    @Override
    protected String[] customCommandLineValidation() {
        final List errorMsgs = new ArrayList();
        if (0 < MAX_READ_LENGTH && MAX_READ_LENGTH < MIN_IDENTICAL_BASES) {
            errorMsgs.add("MAX_READ_LENGTH must be greater than MIN_IDENTICAL_BASES");
        }
        if (MIN_IDENTICAL_BASES <= 0) {
            errorMsgs.add("MIN_IDENTICAL_BASES must be greater than 0");
        }
        return errorMsgs.isEmpty() ? super.customCommandLineValidation() : errorMsgs.toArray(new String[errorMsgs.size()]);
    }

    /**
     * Little class to hold the sequence of a pair of reads and tile location information.
     */
    static class PairedReadSequence extends PhysicalLocationShort {

        //just for rough estimate size of reads, does not affect the fundamental operation of the algorithm
        static final int NUMBER_BASES_IN_READ = 150;

        short readGroup = -1;
        boolean qualityOk = true;
        byte[] read1;
        byte[] read2;
        short libraryId;

        // Hashes corresponding to read1 and read2
        int[] hashes1;
        int[] hashes2;

        public static int getSizeInBytes() {
            // rough guess at memory footprint, summary size of all fields
            return 16 + 4 + (2 * 4) + 1 + 2 * (24 + 8 + NUMBER_BASES_IN_READ) + 2 + (2 * (24 + 8)) + 8 + 4;
        }

        public short getReadGroup() { return this.readGroup; }

        public void setReadGroup(final short readGroup) { this.readGroup = readGroup; }

        public short getLibraryId() { return this.libraryId; }

        public void setLibraryId(final short libraryId) { this.libraryId = libraryId; }

        public static SortingCollection.Codec getCodec() {
            return new PairedReadCodec();
        }

        void initHashes(int numberOfHashes, int skippedBases, int minReadLength) {
            hashes1 = getHashes(read1, numberOfHashes, skippedBases, minReadLength);
            hashes2 = getHashes(read2, numberOfHashes, skippedBases, minReadLength);
        }

        // Split read by numberOfHashes parts and hash each part
        // For instance:
        //        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
        // read = A C G T A C G T A C G  T  A  C  G  T  A  C  G  T
        // numberOfHashes = 5
        // skippedBases = 1
        // minReadLength = 15
        // So, method returns hashValues with 5 hash value
        // first value calculated from read[1], read[6], read[11]
        // second value calculated from read[2], read[7], read[12]
        // etc.
        // chars from 16 to 19 position is a tail, see compareTails() in ElcHashBasedDuplicatesFinder
        private int[] getHashes(byte[] read, int numberOfHashes, int skippedBases, int minReadLength) {
            final int[] hashValues = new int[numberOfHashes];
            for (int i = 0; i < numberOfHashes; ++i) {
                hashValues[i] = 1;
                int position = skippedBases + i;
                while (position < minReadLength) {
                    hashValues[i] = 31 * hashValues[i] + read[position];
                    position += numberOfHashes;
                }
            }
            return hashValues;
        }
    }

    static class PairedReadSequenceWithBarcodes extends PairedReadSequence {
        int barcode; // primary barcode for this read (and pair)
        int readOneBarcode; // read one barcode, 0 if not present
        int readTwoBarcode; // read two barcode, 0 if not present or not paired

        public PairedReadSequenceWithBarcodes() {
            super();
        }

        public PairedReadSequenceWithBarcodes(final PairedReadSequence val) {
            if (null == val) throw new PicardException("val was null");
            this.readGroup = val.getReadGroup();
            this.tile = val.getTile();
            this.x = val.getX();
            this.y = val.getY();
            this.qualityOk = val.qualityOk;
            this.read1 = val.read1.clone();
            this.read2 = val.read2.clone();
            this.libraryId = val.getLibraryId();
        }

        public static int getSizeInBytes() {
            return PairedReadSequence.getSizeInBytes() + (3 * 4); // rough guess at memory footprint
        }
    }

    /**
     * Codec class for writing and read PairedReadSequence objects.
     */
    static class PairedReadCodec implements SortingCollection.Codec {
        protected DataOutputStream out;
        protected DataInputStream in;

        public void setOutputStream(final OutputStream out) {
            this.out = new DataOutputStream(out);
        }

        public void setInputStream(final InputStream in) {
            this.in = new DataInputStream(in);
        }

        public void encode(final PairedReadSequence val) {
            try {
                this.out.writeShort(val.readGroup);
                this.out.writeShort(val.tile);
                this.out.writeShort(val.x);
                this.out.writeShort(val.y);
                this.out.writeInt(val.read1.length);
                this.out.write(val.read1);
                this.out.writeInt(val.read2.length);
                this.out.write(val.read2);
            } catch (final IOException ioe) {
                throw new PicardException("Error write out read pair.", ioe);
            }
        }

        public PairedReadSequence decode() {
            try {
                final PairedReadSequence val = new PairedReadSequence();
                try {
                    val.readGroup = this.in.readShort();
                } catch (final EOFException eof) {
                    return null;
                }

                val.tile = this.in.readShort();
                val.x = this.in.readShort();
                val.y = this.in.readShort();

                int length = this.in.readInt();
                val.read1 = new byte[length];
                if (this.in.read(val.read1) != length) {
                    throw new PicardException("Could not read " + length + " bytes from temporary file.");
                }

                length = this.in.readInt();
                val.read2 = new byte[length];
                if (this.in.read(val.read2) != length) {
                    throw new PicardException("Could not read " + length + " bytes from temporary file.");
                }

                return val;
            } catch (final IOException ioe) {
                throw new PicardException("Exception reading read pair.", ioe);
            }
        }

        @Override
        public SortingCollection.Codec clone() { return new PairedReadCodec(); }
    }


    /**
     * Codec class for writing and read PairedReadSequence objects.
     */
    static class PairedReadWithBarcodesCodec extends PairedReadCodec {
        @Override
        public void encode(final PairedReadSequence val) {
            if (!(val instanceof PairedReadSequenceWithBarcodes)) {
                throw new PicardException("Val was not a PairedReadSequenceWithBarcodes");
            }
            final PairedReadSequenceWithBarcodes data = (PairedReadSequenceWithBarcodes) val;

            super.encode(val);

            try {
                this.out.writeInt(data.barcode);
                this.out.writeInt(data.readOneBarcode);
                this.out.writeInt(data.readTwoBarcode);
            } catch (final IOException ioe) {
                throw new PicardException("Error write out read pair.", ioe);
            }
        }

        @Override
        public PairedReadSequence decode() {
            try {
                final PairedReadSequence parentVal = super.decode();
                if (null == parentVal) return null; // EOF
                final PairedReadSequenceWithBarcodes val = new PairedReadSequenceWithBarcodes(parentVal);
                val.barcode = this.in.readInt();
                val.readOneBarcode = this.in.readInt();
                val.readTwoBarcode = this.in.readInt();

                return val;
            } catch (final IOException ioe) {
                throw new PicardException("Exception reading read pair.", ioe);
            }
        }

        @Override
        public SortingCollection.Codec clone() { return new PairedReadWithBarcodesCodec(); }
    }

    /**
     * Comparator that orders read pairs on the first N bases of both reads.
     * There is no tie-breaking, so any sort is stable, not total.
     */
    private class PairedReadComparator implements Comparator {
        final int BASES = EstimateLibraryComplexity.this.MIN_IDENTICAL_BASES;

        public int compare(final PairedReadSequence lhs, final PairedReadSequence rhs) {
            // First compare the first N bases of the first read
            for (int i = 0; i < BASES; ++i) {
                final int retval = lhs.read1[i] - rhs.read1[i];
                if (retval != 0) return retval;
            }

            // Then compare the first N bases of the second read
            for (int i = 0; i < BASES; ++i) {
                final int retval = lhs.read2[i] - rhs.read2[i];
                if (retval != 0) return retval;
            }

            return 0;
        }
    }

    public int getBarcodeValue(final SAMRecord record) {
        return getReadBarcodeValue(record, BARCODE_TAG);
    }

    public static int getReadBarcodeValue(final SAMRecord record, final String tag) {
        if (null == tag) return 0;
        final String attr = record.getStringAttribute(tag);
        if (null == attr) return 0;
        else return attr.hashCode();
    }

    private int getReadOneBarcodeValue(final SAMRecord record) {
        return getReadBarcodeValue(record, READ_ONE_BARCODE_TAG);
    }

    private int getReadTwoBarcodeValue(final SAMRecord record) {
        return getReadBarcodeValue(record, READ_TWO_BARCODE_TAG);
    }

    /** Stock main method. */
    public static void main(final String[] args) {
        new EstimateLibraryComplexity().instanceMainWithExit(args);
    }

    public EstimateLibraryComplexity() {
        final int sizeInBytes;
        if (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG) {
            sizeInBytes = PairedReadSequenceWithBarcodes.getSizeInBytes();
        } else {
            sizeInBytes = PairedReadSequence.getSizeInBytes();
        }
        MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
    }

    /**
     * Method that does most of the work.  Reads through the input BAM file and extracts the
     * read sequences of each read pair and sorts them via a SortingCollection.  Then traverses
     * the sorted reads and looks at small groups at a time to find duplicates.
     */
    @Override
    protected int doWork() {
        for (final File f : INPUT) IOUtil.assertFileIsReadable(f);

        log.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting.");

        final List readGroups = new ArrayList();
        final SortingCollection sorter;
        final boolean useBarcodes = (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG);

        if (!useBarcodes) {
            sorter = SortingCollection.newInstance(PairedReadSequence.class,
                    new PairedReadCodec(),
                    new PairedReadComparator(),
                    MAX_RECORDS_IN_RAM,
                    TMP_DIR);
        } else {
            sorter = SortingCollection.newInstance(PairedReadSequence.class,
                    new PairedReadWithBarcodesCodec(),
                    new PairedReadComparator(),
                    MAX_RECORDS_IN_RAM,
                    TMP_DIR);
        }

        // Loop through the input files and pick out the read sequences etc.
        final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
        for (final File f : INPUT) {
            final Map pendingByName = new HashMap();
            final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(f);
            readGroups.addAll(in.getFileHeader().getReadGroups());

            for (final SAMRecord rec : in) {
                if (!rec.getReadPairedFlag()) continue;
                if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) {
                    continue;
                }
                if (rec.isSecondaryOrSupplementary()) continue;

                PairedReadSequence prs = pendingByName.remove(rec.getReadName());
                if (prs == null) {
                    // Make a new paired read object and add RG and physical location information to it
                    prs = useBarcodes ? new PairedReadSequenceWithBarcodes() : new PairedReadSequence();
                    if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) {
                        final SAMReadGroupRecord rg = rec.getReadGroup();
                        if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg));
                    }

                    pendingByName.put(rec.getReadName(), prs);
                }

                // Read passes quality check if both ends meet the mean quality criteria
                final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(),
                        rec.getBaseQualities(),
                        MIN_IDENTICAL_BASES,
                        MIN_MEAN_QUALITY);
                prs.qualityOk = prs.qualityOk && passesQualityCheck;

                // Get the bases and restore them to their original orientation if necessary
                final byte[] bases = rec.getReadBases();
                if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases);

                final PairedReadSequenceWithBarcodes prsWithBarcodes = (useBarcodes) ? (PairedReadSequenceWithBarcodes) prs : null;

                if (rec.getFirstOfPairFlag()) {
                    prs.read1 = bases;
                    if (useBarcodes) {
                        prsWithBarcodes.barcode = getBarcodeValue(rec);
                        prsWithBarcodes.readOneBarcode = getReadOneBarcodeValue(rec);
                    }
                } else {
                    prs.read2 = bases;
                    if (useBarcodes) {
                        prsWithBarcodes.readTwoBarcode = getReadTwoBarcodeValue(rec);
                    }
                }

                if (prs.read1 != null && prs.read2 != null && prs.qualityOk) {
                    sorter.add(prs);
                }

                progress.record(rec);
            }
            CloserUtil.close(in);
        }

        log.info(String.format("Finished reading - read %d records - moving on to scanning for duplicates.", progress.getCount()));

        // Now go through the sorted reads and attempt to find duplicates
        final PeekableIterator iterator = new PeekableIterator(sorter.iterator());

        final Map> duplicationHistosByLibrary = new HashMap>();
        final Map> opticalHistosByLibrary = new HashMap>();

        int groupsProcessed = 0;
        long lastLogTime = System.currentTimeMillis();
        final int meanGroupSize = (int) (Math.max(1, (progress.getCount() / 2) / (int) pow(4, MIN_IDENTICAL_BASES * 2)));

        ElcDuplicatesFinderResolver algorithmResolver = new ElcDuplicatesFinderResolver(
                MAX_DIFF_RATE,
                MAX_READ_LENGTH,
                MIN_IDENTICAL_BASES,
                useBarcodes,
                opticalDuplicateFinder
        );

        while (iterator.hasNext()) {
            // Get the next group and split it apart by library
            final List group = getNextGroup(iterator);

            if (group.size() > meanGroupSize * MAX_GROUP_RATIO) {
                final PairedReadSequence prs = group.get(0);
                log.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " +
                        "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " +
                        StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) +
                        " / " +
                        StringUtil.bytesToString(prs.read2, 0, MIN_IDENTICAL_BASES));
            } else {
                final Map> sequencesByLibrary = splitByLibrary(group, readGroups);

                // Now process the reads by library
                for (final Map.Entry> entry : sequencesByLibrary.entrySet()) {
                    final String library = entry.getKey();
                    final List seqs = entry.getValue();

                    Histogram duplicationHisto = duplicationHistosByLibrary.get(library);
                    Histogram opticalHisto = opticalHistosByLibrary.get(library);
                    if (duplicationHisto == null) {
                        duplicationHisto = new Histogram<>("duplication_group_count", library);
                        opticalHisto = new Histogram<>("duplication_group_count", "optical_duplicates");
                        duplicationHistosByLibrary.put(library, duplicationHisto);
                        opticalHistosByLibrary.put(library, opticalHisto);
                    }

                    algorithmResolver.resolveAndSearch(seqs, duplicationHisto, opticalHisto);
                }

                ++groupsProcessed;
                if (lastLogTime < System.currentTimeMillis() - 60000) {
                    log.info("Processed " + groupsProcessed + " groups.");
                    lastLogTime = System.currentTimeMillis();
                }
            }
        }

        iterator.close();
        sorter.cleanup();

        final MetricsFile file = getMetricsFile();
        for (final String library : duplicationHistosByLibrary.keySet()) {
            final Histogram duplicationHisto = duplicationHistosByLibrary.get(library);
            final Histogram opticalHisto = opticalHistosByLibrary.get(library);
            final DuplicationMetrics metrics = new DuplicationMetrics();
            metrics.LIBRARY = library;

            // Filter out any bins that have fewer than MIN_GROUP_COUNT entries in them and calculate derived metrics
            for (final Integer bin : duplicationHisto.keySet()) {
                final double duplicateGroups = duplicationHisto.get(bin).getValue();
                final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue();

                if (duplicateGroups >= MIN_GROUP_COUNT) {
                    metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups);
                    metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups);
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates;
                }
            }

            metrics.calculateDerivedFields();
            file.addMetric(metrics);
            file.addHistogram(duplicationHisto);

        }

        file.write(OUTPUT);

        return 0;
    }

    /**
     * Pulls out of the iterator the next group of reads that can be compared to each other to
     * identify duplicates.
     */
    List getNextGroup(final PeekableIterator iterator) {
        final List group = new ArrayList();
        final PairedReadSequence first = iterator.next();
        group.add(first);

        outer:
        while (iterator.hasNext()) {
            final PairedReadSequence next = iterator.peek();
            for (int i = 0; i < MIN_IDENTICAL_BASES; ++i) {
                if (first.read1[i] != next.read1[i] || first.read2[i] != next.read2[i]) break outer;
            }
            group.add(iterator.next());
        }
        return group;
    }

    /**
     * Takes a list of PairedReadSequence objects and splits them into lists by library.
     */
    Map> splitByLibrary(final List input,
                                                         final List rgs) {

        final Map> out = new HashMap<>();
        for (final PairedReadSequence seq : input) {
            String library;
            if (seq.getReadGroup() != -1) {
                library = rgs.get(seq.getReadGroup()).getLibrary();
                if (library == null) library = "Unknown";
            } else {
                library = "Unknown";
            }

            List librarySeqs = out.get(library);
            if (librarySeqs == null) {
                librarySeqs = new ArrayList<>();
                out.put(library, librarySeqs);
            }
            librarySeqs.add(seq);
        }
        return out;
    }

    /**
     * Checks that the average quality over the entire read is >= min, and that the first N bases do
     * not contain any no-calls.
     */
    boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int seedLength, final int minQuality) {
        if (bases.length < seedLength) return false;

        for (int i = 0; i < seedLength; ++i) {
            if (SequenceUtil.isNoCall(bases[i])) return false;
        }

        final int maxReadLength = (MAX_READ_LENGTH <= 0) ? Integer.MAX_VALUE : MAX_READ_LENGTH;
        final int readLength = Math.min(bases.length, maxReadLength);
        int total = 0;
        for (int i = 0; i < readLength; i++) total += quals[i];
        return total / readLength >= minQuality;
    }
}