All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.reference.FastaSequenceIndex Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package htsjdk.samtools.reference;

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.IOUtil;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.regex.MatchResult;

/**
 * Reads/writes a fasta index file (.fai), as generated by `samtools faidx`.
 */
public class FastaSequenceIndex implements Iterable {
    /**
     * Store the entries.  Use a LinkedHashMap for consistent iteration in insertion order.
     */
    private final Map sequenceEntries = new LinkedHashMap();

    /**
     * Build a sequence index from the specified file.
     * @param indexFile File to open.
     * @throws FileNotFoundException if the index file cannot be found.
     */
    public FastaSequenceIndex( File indexFile ) {
        this(IOUtil.toPath(indexFile));
    }

    /**
     * Build a sequence index from the specified file.
     * @param indexFile File to open.
     * @throws FileNotFoundException if the index file cannot be found.
     */
    public FastaSequenceIndex( Path indexFile ) {
        IOUtil.assertFileIsReadable(indexFile);
        try (InputStream in = Files.newInputStream(indexFile)) {
            parseIndexFile(in);
        } catch (IOException e) {
            throw new SAMException("Fasta index file could not be opened: " + indexFile, e);
        }
    }

    /**
     * Build a sequence index from the specified input stream.
     * @param in InputStream to read from.
     */
    public FastaSequenceIndex(InputStream in) {
        parseIndexFile(in);
    }

    /**
     * Empty, protected constructor for unit testing.
     */
    protected FastaSequenceIndex() {}

    /**
     * Add a new index entry to the list.  Protected for unit testing.
     * @param indexEntry New index entry to add.
     */
    protected void add(FastaSequenceIndexEntry indexEntry) {
        final FastaSequenceIndexEntry ret = sequenceEntries.put(indexEntry.getContig(),indexEntry);
        if (ret != null) {
            throw new SAMException("Contig '" + indexEntry.getContig() + "' already exists in fasta index.");
        }
    }

    /**
     * Renames the existing index entry to the new index entry with the specified name.
     * @param entry entry to update.
     * @param newName New name for the index entry.
     */
    protected void rename(FastaSequenceIndexEntry entry,String newName) {
        sequenceEntries.remove(entry.getContig());
        entry.setContig(newName);
        add(entry);
    }

    /**
     * Compare two FastaSequenceIndex objects for equality.
     * @param other Another FastaSequenceIndex to compare
     * @return True if index has the same entries as other instance, in the same order
     */
    public boolean equals(Object other) {
        if(!(other instanceof FastaSequenceIndex))
            return false;

        if (this == other) return true;

        FastaSequenceIndex otherIndex = (FastaSequenceIndex)other;
        if(this.size() != otherIndex.size())
            return false;

        Iterator iter = this.iterator();
        Iterator otherIter = otherIndex.iterator();
        while (iter.hasNext()) {
            if (!otherIter.hasNext())
                return false;
            if (!iter.next().equals(otherIter.next()))
                return false;
        }
        return true;
    }

    /**
     * Parse the contents of an index file, caching the results internally.
     * @param in InputStream to parse.
     */
    private void parseIndexFile(InputStream in) {
        try (Scanner scanner = new Scanner(in)) {
            int sequenceIndex = 0;
            while( scanner.hasNext() ) {
                // Tokenize and validate the index line.
                String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
                if( result == null )
                    throw new SAMException("Found invalid line in index file:" + scanner.nextLine());
                MatchResult tokens = scanner.match();
                if( tokens.groupCount() != 5 )
                    throw new SAMException("Found invalid line in index file:" + scanner.nextLine());

                // Skip past the line separator
                scanner.nextLine();

                // Parse the index line.
                String contig = tokens.group(1);
                long size = Long.valueOf(tokens.group(2));
                long location = Long.valueOf(tokens.group(3));
                int basesPerLine = Integer.valueOf(tokens.group(4));
                int bytesPerLine = Integer.valueOf(tokens.group(5));

                contig = SAMSequenceRecord.truncateSequenceName(contig);
                // Build sequence structure
                add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) );
            }
        }
    }

    /**
     * Writes this index to the specified path.
     *
     * @param indexFile index file to output the index in the .fai format
     *
     * @throws IOException if an IO error occurs.
     */
    public void write(final Path indexFile) throws IOException {
        try (final PrintStream writer = new PrintStream(Files.newOutputStream(indexFile))) {
            sequenceEntries.values().forEach(se ->
                    writer.println(String.join("\t",
                            se.getContig(),
                            String.valueOf(se.getSize()),
                            String.valueOf(se.getLocation()),
                            String.valueOf(se.getBasesPerLine()),
                            String.valueOf(se.getBytesPerLine()))
                    )
            );
        }
    }

    /**
     * Does the given contig name have a corresponding entry?
     * @param contigName The contig name for which to search.
     * @return True if contig name is present; false otherwise.
     */
    public boolean hasIndexEntry( String contigName ) {
        return sequenceEntries.containsKey(contigName);
    }

    /**
     * Retrieve the index entry associated with the given contig.
     * @param contigName Name of the contig for which to search.
     * @return Index entry associated with the given contig.
     * @throws SAMException if the associated index entry can't be found.
     */
    public FastaSequenceIndexEntry getIndexEntry( String contigName ) {
        if( !hasIndexEntry(contigName) )
            throw new SAMException("Unable to find entry for contig: " + contigName);

        return sequenceEntries.get(contigName);
    }

    /**
     * Creates an iterator which can iterate through all entries in a fasta index.
     * @return iterator over all fasta index entries.
     */
    @Override
    public Iterator iterator() {
        return sequenceEntries.values().iterator();
    }

    /**
     * Returns the number of elements in the index.
     * @return Number of elements in the index.
     */
    public int size() {
        return sequenceEntries.size();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy