All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.tribble.index.tabix.TabixIndexCreator Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2014 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.tribble.index.tabix;

import htsjdk.samtools.BinningIndexBuilder;
import htsjdk.samtools.BinningIndexContent;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.Feature;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.IndexCreator;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * IndexCreator for Tabix.
 * Features are expected to be 1-based, inclusive.
 *
 * <p>Usage: call {@link #addFeature(Feature, long)} once per feature, in the order the features
 * appear in the file, then call {@link #finalizeIndex(long)} with the file position that follows
 * the last feature.</p>
 */
public class TabixIndexCreator implements IndexCreator {
    private final TabixFormat formatSpec;
    // One entry per sequence whose index builder has been flushed, in the same order as sequenceNames.
    private final List<BinningIndexContent> indexContents = new ArrayList<BinningIndexContent>();
    // Sequence names in the order they were first encountered; a name's position is its reference index.
    private final List<String> sequenceNames = new ArrayList<String>();
    // Merely a faster way to ensure that features are added in a specific sequence name order
    private final Set<String> sequenceNamesSeen = new HashSet<String>();
    // A sequence dictionary is not required, but if it is provided all sequences names must be present in it.
    // It is used to determine the length of a sequence in order to optimize index memory allocation.
    private final SAMSequenceDictionary sequenceDictionary;

    private String currentSequenceName = null;
    private BinningIndexBuilder indexBuilder = null;
    // A feature can't be added to the index until the next feature is added because the next feature
    // defines the location of the end of the previous feature in the output file.
    private TabixFeature previousFeature = null;


    /**
     * @param sequenceDictionary is not required, but if present all features added must refer to sequences in the
     *                           dictionary.  It is used to optimize the memory needed to build the index.
     * @param formatSpec         format description handed to the resulting index; a clone is stored, so later
     *                           changes to the caller's instance do not affect this creator.
     */
    public TabixIndexCreator(final SAMSequenceDictionary sequenceDictionary,
                             final TabixFormat formatSpec) {
        this.sequenceDictionary = sequenceDictionary;
        this.formatSpec = formatSpec.clone();
    }

    /**
     * Creates an index creator without a sequence dictionary.
     *
     * @param formatSpec format description handed to the resulting index (cloned).
     */
    public TabixIndexCreator(final TabixFormat formatSpec) {
        this(null, formatSpec);
    }

    /**
     * Registers the next feature of the file being indexed.
     *
     * @param feature      the feature to index; its contig, start and end are read.
     * @param filePosition file position at which this feature starts. It is also used as the end
     *                     position of the previously added feature.
     * @throws IllegalArgumentException if the feature's sequence was already seen and left, if the feature sorts
     *                                  before the previous one, if the previous feature's start file position is
     *                                  not strictly less than {@code filePosition}, or if a sequence dictionary
     *                                  was provided and does not contain the feature's sequence.
     */
    @Override
    public void addFeature(final Feature feature, final long filePosition) {
        final String sequenceName = feature.getContig();
        final int referenceIndex;
        if (sequenceName.equals(currentSequenceName)) {
            referenceIndex = sequenceNames.size() - 1;
        } else {
            // A new sequence gets the next free index; it is appended to sequenceNames further down.
            referenceIndex = sequenceNames.size();
            if (currentSequenceName != null && sequenceNamesSeen.contains(sequenceName)) {
                throw new IllegalArgumentException("Sequence " + feature + " added out of sequence order");
            }
        }
        final TabixFeature thisFeature = new TabixFeature(referenceIndex, feature.getStart(), feature.getEnd(), filePosition);
        if (previousFeature != null) {
            if (previousFeature.compareTo(thisFeature) > 0) {
                throw new IllegalArgumentException(String.format("Features added out of order: previous (%s) > next (%s)",
                        previousFeature, thisFeature));
            }
            // The previous feature ends where this one starts, so it can now be handed to the builder.
            // This must happen before advancing, because it uses the previous sequence's builder.
            finalizeFeature(filePosition);
        }
        previousFeature = thisFeature;
        if (referenceIndex == sequenceNames.size()) {
            advanceToReference(sequenceName);
        }
    }

    /**
     * Sets the end file position of the pending feature and passes it to the current index builder.
     *
     * @param featureEndPosition file position immediately after the pending feature.
     * @throws IllegalArgumentException if the pending feature does not start strictly before that position.
     */
    private void finalizeFeature(final long featureEndPosition) {
        previousFeature.featureEndFilePosition = featureEndPosition;
        if (previousFeature.featureStartFilePosition >= previousFeature.featureEndFilePosition) {
            throw new IllegalArgumentException(String.format("Feature start position %d >= feature end position %d",
                    previousFeature.featureStartFilePosition, previousFeature.featureEndFilePosition));
        }
        indexBuilder.processFeature(previousFeature);
    }

    /**
     * Flushes the builder of the sequence being left (if any) and starts a builder for a new sequence.
     *
     * @param sequenceName name of the sequence being entered.
     * @throws IllegalArgumentException if a sequence dictionary was provided and lacks {@code sequenceName}.
     */
    private void advanceToReference(final String sequenceName) {
        // If sequence dictionary is provided, BinningIndexBuilder can reduce size of array it allocates.
        // The lookup is done before any state is changed so that a failure leaves the collections untouched.
        final int sequenceLength;
        if (sequenceDictionary != null) {
            if (sequenceDictionary.getSequenceIndex(sequenceName) < 0) {
                throw new IllegalArgumentException("Sequence " + sequenceName +
                        " is not present in the sequence dictionary");
            }
            sequenceLength = sequenceDictionary.getSequence(sequenceName).getSequenceLength();
        } else {
            sequenceLength = 0;
        }
        if (indexBuilder != null) {
            indexContents.add(indexBuilder.generateIndexContent());
        }
        indexBuilder = new BinningIndexBuilder(sequenceNames.size(), sequenceLength);
        sequenceNames.add(sequenceName);
        currentSequenceName = sequenceName;
        sequenceNamesSeen.add(sequenceName);
    }

    /**
     * Completes the pending feature, flushes the last builder and assembles the index.
     *
     * @param finalFilePosition file position immediately after the last feature that was added.
     * @return a {@link TabixIndex} built from the format spec, the sequence names seen and their index contents.
     * @throws IllegalArgumentException if the last feature does not start strictly before {@code finalFilePosition}.
     */
    @Override
    public Index finalizeIndex(final long finalFilePosition) {
        if (previousFeature != null) {
            finalizeFeature(finalFilePosition);
        }
        if (indexBuilder != null) {
            indexContents.add(indexBuilder.generateIndexContent());
        }
        // The array is sized by the number of sequences that actually had features, not by the
        // sequence dictionary: dictionary sequences without any features get no slot.
        final BinningIndexContent[] indices = indexContents.toArray(new BinningIndexContent[sequenceNames.size()]);
        return new TabixIndex(formatSpec, sequenceNames, indices);
    }


    /**
     * A feature waiting to be indexed. It is ordered by reference index, then by start.
     */
    private static class TabixFeature implements BinningIndexBuilder.FeatureToBeIndexed, Comparable<TabixFeature> {
        private final int referenceIndex;
        private final int start;
        private final int end;
        private final long featureStartFilePosition;
        // Position after this feature in the file.
        private long featureEndFilePosition = -1;

        private TabixFeature(final int referenceIndex, final int start, final int end, final long featureStartFilePosition) {
            this.referenceIndex = referenceIndex;
            this.start = start;
            this.end = end;
            this.featureStartFilePosition = featureStartFilePosition;
        }

        @Override
        public int getStart() {
            return start;
        }

        @Override
        public int getEnd() {
            return end;
        }

        /**
         *
         * @return null -- Let index builder compute this.
         */
        @Override
        public Integer getIndexingBin() {
            return null;
        }

        /**
         * @return the chunk spanning this feature's start and end file positions.
         * @throws IllegalStateException if the end file position has not been set yet.
         */
        @Override
        public Chunk getChunk() {
            if (featureEndFilePosition == -1) {
                throw new IllegalStateException("End position is not set");
            }
            return new Chunk(featureStartFilePosition, featureEndFilePosition);
        }

        /**
         * Orders by reference index, then by start. {@code Integer.compare} is used instead of
         * subtraction so the result cannot overflow, even for a malformed negative coordinate.
         */
        @Override
        public int compareTo(final TabixFeature other) {
            final int byReference = Integer.compare(this.referenceIndex, other.referenceIndex);
            if (byReference != 0) {
                return byReference;
            }
            return Integer.compare(this.start, other.start);
        }

        @Override
        public String toString() {
            return "TabixFeature{" +
                    "referenceIndex=" + referenceIndex +
                    ", start=" + start +
                    ", end=" + end +
                    ", featureStartFilePosition=" + featureStartFilePosition +
                    ", featureEndFilePosition=" + featureEndFilePosition +
                    '}';
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy