All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.util.GZIIndex Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2017 Daniel Gomez-Sanchez
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package htsjdk.samtools.util;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.ByteChannel;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Represents a .gzi index of a block-compressed file.
 *
 * 

The .gzi index is a mapping between the offset of each block in the gzipped file and the * uncompressed offset that that block starts with. This mapping is represented by {@link IndexEntry}. * *

An example of usage for this index for random access a bgzip file using an index generated * from raw data. For example, for indexing a compressed FASTA file the .gzi index can be used in * conjunction with the {@link htsjdk.samtools.reference.FastaSequenceIndex} to seek concrete * sequences. * * @author Daniel Gomez-Sanchez (magicDGS) * @see https://github.com/samtools/htslib/issues/473 */ public final class GZIIndex { /** Default extension for the files storing a {@link GZIIndex}. */ /** * @deprecated since June 2019 Use {@link FileExtensions#GZI} instead. */ @Deprecated public static final String DEFAULT_EXTENSION = FileExtensions.GZI; /** * Index entry mapping the block-offset (compressed offset) to the uncompressed offset where the * block starts. * * @see https://github.com/samtools/htslib/issues/473 */ public static final class IndexEntry { private final long compressedOffset; private final long uncompressedOffset; private IndexEntry(final long compressedOffset, final long uncompressedOffset) { if (compressedOffset < 0) { throw new IllegalArgumentException("negative compressed offset: " + compressedOffset); } if (uncompressedOffset < 0) { throw new IllegalArgumentException("negative uncompressed offset: " + uncompressedOffset); } this.compressedOffset = compressedOffset; this.uncompressedOffset = uncompressedOffset; } /** Returns the compressed offset (block-offset). */ public long getCompressedOffset() { return compressedOffset; } /** Returns the uncompressed offset. */ public long getUncompressedOffset() { return uncompressedOffset; } @Override public String toString() { return String.format("IndexEntry={compressed=%d(0x%x),uncompressed=%d(0x%x)", compressedOffset, compressedOffset, uncompressedOffset, uncompressedOffset); } @Override public boolean equals(final Object obj) { if (obj == null || !(obj instanceof IndexEntry)) { return false; } final IndexEntry other = (IndexEntry) obj; return compressedOffset == other.compressedOffset && uncompressedOffset == other.uncompressedOffset; } @Override public int hashCode() { return 31 * Long.hashCode(compressedOffset) + Long.hashCode(uncompressedOffset); } } private final List entries; // private constructors private GZIIndex(final List entries) { this.entries = Collections.unmodifiableList(entries); } /** * Gets the number of blocks on the file. * * @return the number of blocks. */ public int getNumberOfBlocks() { // +1 because the first block is not included in the entry list return entries.size() + 1; } /** * Gets an unmodifiable list with the index entries. * *

Note: because the first block corresponds to a dummy index entry (0, 0), the returned list * does not include it. Thus, the size of the list is {@code getNumberOfBlocks() - 1}. * * @return index entries. */ public List getIndexEntries() { return entries; } /** * Gets the virtual offset for seek with {@link BlockCompressedInputStream#seek(long)}. * *

{@link BlockCompressedInputStream#seek(long)} parameter is not a byte-offset, but a * special virtual file pointer that specifies the block-offset within the file * ({@link BlockCompressedFilePointerUtil#getBlockAddress(long)}), and the offset within the * block ({@link BlockCompressedFilePointerUtil#getBlockOffset(long)}). * *

This methods converts the provided byte-offset on the file to the special file pointer * used to seek a block-compressed file, using this index to find the block where the * byte-offset is located. * * @param uncompressedOffset the file-offset. * * @return virtual offset for {@link BlockCompressedInputStream}. * @see BlockCompressedFilePointerUtil */ public long getVirtualOffsetForSeek(final long uncompressedOffset) { try { if (uncompressedOffset == 0) { return BlockCompressedFilePointerUtil.makeFilePointer(0, 0); } // binary search in the entries for the uncompressed offset final int pos = Collections.binarySearch(entries, // this is a fake index for getting the uncompressed offsets new IndexEntry(0, uncompressedOffset), Comparator.comparingLong(IndexEntry::getUncompressedOffset)); // if it is found, it is at the beginning of the block if (pos >= 0) { return BlockCompressedFilePointerUtil.makeFilePointer( entries.get(pos).getCompressedOffset(), // offset in the block is always 0 (beginning of the block) 0); } // if pos < 0, then the offset is in the previous block to the insertion point // the insertion_point == -pos - 1 // previous block = inserion_point - 1 final int entryPos = -pos - 2; // if the insertion point is -1, it means that it is in the first block // so the virtual offset is just the uncompressed offset on the first block if (entryPos == -1) { return BlockCompressedFilePointerUtil.makeFilePointer(0, Math.toIntExact(uncompressedOffset)); } final IndexEntry indexEntry = entries.get(entryPos); // now we should convert the uncompressed offste to an offset within the block final int blockOffset = Math.toIntExact(uncompressedOffset - indexEntry.getUncompressedOffset()); // we use the file pointer utils to convert to the virtual-offset representation return BlockCompressedFilePointerUtil.makeFilePointer(indexEntry.getCompressedOffset(), blockOffset); } catch (ArithmeticException e) { throw new IllegalArgumentException("Cannot handle offsets within blocks larger than " + Integer.MAX_VALUE, e); } } @Override public boolean equals(final Object obj) { if (obj == null || !(obj instanceof GZIIndex)) { return false; } return this.entries.equals(((GZIIndex) obj).entries); } @Override public int hashCode() { return entries.hashCode(); } @Override public String toString() { return "GZIIndex:" + StringUtil.join(", ", entries); } /** * Writes this index into the requested path. * * @param output the output path. * * @throws IOException if an I/O error occurs. */ public void writeIndex(final Path output) throws IOException { writeIndex(Files.newOutputStream(output)); } /** * Writes this index into the requested path. * * NOTE: This method will close out the provided output stream when it finishes writing the index * * @param output the output file. * * @throws IOException if an I/O error occurs. */ public void writeIndex(final OutputStream output) throws IOException { if (output == null) { throw new IllegalArgumentException("null output path"); } BinaryCodec codec = new BinaryCodec(output); // the first entry is never written - 0, 0 final int numberOfBlocksToWrite = entries.size(); // put the number of entries codec.writeLong(Integer.toUnsignedLong(numberOfBlocksToWrite)); // except the first, iterate over all the entries for (final IndexEntry entry : entries) { // implementation of entry ensures that the offsets are no negative codec.writeLong(entry.getCompressedOffset()); codec.writeLong(entry.getUncompressedOffset()); } // Close the codec to ensure the output is written codec.close(); } /** * Loads the index from the provided file. * * @param indexPath the path for the index to load. * * @return loaded index. * * @throws IOException if an I/O error occurs. */ public static final GZIIndex loadIndex(final Path indexPath) throws IOException { if (indexPath == null) { throw new IllegalArgumentException("null input path"); } try (final ReadableByteChannel channel = Files.newByteChannel(indexPath)) { return loadIndex(indexPath.toUri().toString(), channel); } } /** * Loads the index from the provided input stream. * * @param source The named source of the reference file (used in error messages). May be null if unknown. * @param indexIn the input stream for the index to load. * * @return loaded index. * * @throws IOException if an I/O error occurs. */ public static final GZIIndex loadIndex(final String source, final InputStream indexIn) throws IOException { if (indexIn == null) { throw new IllegalArgumentException("null input stream"); } try (final ReadableByteChannel channel = Channels.newChannel(indexIn)) { return loadIndex(source, channel); } } /** * Loads the index from the provided channel. * * @param source The named source of the reference file (used in error messages). May be null if unknown. * @param channel the channel to read the index from. * * @return loaded index. * * @throws IOException if an I/O error occurs. */ public static final GZIIndex loadIndex(final String source, final ReadableByteChannel channel) throws IOException { // allocate a buffer for re-use for read each byte ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); buffer.order(ByteOrder.LITTLE_ENDIAN); if (Long.BYTES != channel.read(buffer)) { throw getCorruptedIndexException(source, "less than " + Long.BYTES+ "bytes", null); } buffer.flip(); final int numberOfEntries; try { numberOfEntries = Math.toIntExact(buffer.getLong()); } catch (ArithmeticException e) { buffer.flip(); throw getCorruptedIndexException(source, String.format("HTSJDK cannot handle more than %d entries in .gzi index, but found %s", Integer.MAX_VALUE, buffer.getLong()), e); } // allocate array with the entries and add the first one final List entries = new ArrayList<>(numberOfEntries); // create a new buffer with the correct size and read into it buffer = allocateBuffer(numberOfEntries, false); channel.read(buffer); buffer.flip(); for (int i = 0; i < numberOfEntries; i++) { final IndexEntry entry; try { entry = new IndexEntry(buffer.getLong(), buffer.getLong()); } catch (IllegalArgumentException e) { throw getCorruptedIndexException(source, e.getMessage(), e); } // check if the entry is increasing in order if (i == 0) { if (entry.getUncompressedOffset() == 0 && entry.getCompressedOffset() == 0) { throw getCorruptedIndexException(source, "first block index entry should not be present", null); } } else if (entries.get(i - 1).getCompressedOffset() >= entry.getCompressedOffset() || entries.get(i - 1).getUncompressedOffset() >= entry.getUncompressedOffset()) { throw getCorruptedIndexException(source, String.format("index entries in misplaced order - %s vs %s", entries.get(i - 1), entry), null); } entries.add(entry); } return new GZIIndex(entries); } private static final IOException getCorruptedIndexException(final String source, final String msg, final Exception e) { return new IOException(String.format("Corrupted index file: %s (%s)", msg, source == null ? "unknown" : source), e); } /** * Builds a {@link GZIIndex} on the fly from a BGZIP file. * *

Note that this method does not write the index on disk. Use {@link #writeIndex(OutputStream)} on * the returned object to save the index. * * @param bgzipFile the bgzip file. * * @return in-memory .gzi index. * * @throws IOException if an I/O error occurs. */ public static final GZIIndex buildIndex(final Path bgzipFile) throws IOException { if (bgzipFile == null) { throw new IllegalArgumentException("null input path"); } // open the file for reading as a block-compressed file try (final BlockCompressedInputStream bgzipStream = new BlockCompressedInputStream(Files.newInputStream(bgzipFile))) { // store the entries as a list final List entries = new ArrayList<>(); // accumulator for number of bytes read to use in the offset for the index-entry int currentOffset = 0; // until the end of the stream while (bgzipStream.read() != -1) { currentOffset++; // if we are at the end of the block if (bgzipStream.endOfBlock()) { // gets the block address (compressed offset) - requires to parse with the file pointer utils final long compressed = BlockCompressedFilePointerUtil.getBlockAddress(bgzipStream.getFilePointer()); // add a new IndexEntry entries.add(new IndexEntry(compressed, currentOffset)); } } return new GZIIndex(entries); } } /** * Creates a {@link GZIIndex} from a BGZIP file and store it in memory and disk. * * @param bgzipFile the bgzip file. * @param overwrite if the .fai index already exists override it if {@code true}; otherwise, throws a {@link IOException}. * * @return the in-memory representation for the created index. * @throws IOException if an IO error occurs. */ public static GZIIndex createIndex(final Path bgzipFile, final boolean overwrite) throws IOException { final Path indexFile = resolveIndexNameForBgzipFile(bgzipFile); if (!overwrite && Files.exists(indexFile)) { // throw an exception if the file already exists throw new IOException("Index file " + indexFile + " already exists for " + bgzipFile); } // build the index, write and return final GZIIndex index = buildIndex(bgzipFile); index.writeIndex(new BufferedOutputStream(Files.newOutputStream(indexFile))); return index; } /** Gets the default index path for the bgzip file. */ public static Path resolveIndexNameForBgzipFile(final Path bgzipFile) { return bgzipFile.resolveSibling(bgzipFile.getFileName().toString() + FileExtensions.GZI); } // helper method for allocate a buffer for read/write private static final ByteBuffer allocateBuffer(final int numberOfEntries, final boolean includeNumberOfEntries) { // everything is encoded as an unsigned long int size = (includeNumberOfEntries) ? Long.BYTES : 0; size += numberOfEntries * 2 * Long.BYTES; // creates a byte buffer in little-endian return ByteBuffer.allocate(size).order(ByteOrder.LITTLE_ENDIAN); } /** * Helper class for constructing the GZIindex. * * In order to construct a GZI index addGzipBlock() should be called every time a new Gzip Block is written out and * the entire index will be written out when close() is called. */ public static final class GZIIndexer implements Closeable { private int uncompressedFileOffset; private final OutputStream output; private final List entries = new ArrayList<>(); public GZIIndexer(final OutputStream outputStream) { output = outputStream; } public GZIIndexer(final Path outputFile) throws IOException { output = Files.newOutputStream(outputFile); } // Adds a new index location given the compressed file offset and a running tally based on the uncompressed block sizes public void addGzipBlock(final long compressedFileOffset, final long uncompressedBlockSize) { IndexEntry indexEntry = new IndexEntry(compressedFileOffset, uncompressedFileOffset); uncompressedFileOffset += uncompressedBlockSize; entries.add(indexEntry); } @Override public void close() throws IOException { GZIIndex index = new GZIIndex(entries); index.writeIndex(output); //NOTE this relies on writeIndex closing the output stream for it } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy