Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Copyright (c) 2009-2010 by The Broad Institute, Inc.
* All Rights Reserved.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL), Version 2.1 which
* is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*
* THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR WARRANTIES OF
* ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT
* OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR
* RESPECTIVE TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES OF
* ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES, ECONOMIC
* DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER THE BROAD OR MIT SHALL
* BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE
* FOREGOING.
*/
package htsjdk.tribble.index.linear;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.index.AbstractIndex;
import htsjdk.tribble.index.Block;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.util.LittleEndianInputStream;
import htsjdk.tribble.util.LittleEndianOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.nio.file.Path;
import java.util.*;
/**
* Index defined by dividing the genome by chromosome, then each chromosome into bins of fixed width (in
* genomic coordinates). Features are allocated to bins by start position. The longest feature in each
* recorded and used to adjust the start position of a query to include all bins that might have a feature
* that overlaps the query interval. This works well for feature sets of approximately homogeneous length,
* or whose longest feature is on the order of the bin width or less.
*
* magicNumber integer
* type integer
* version integer
* filename null terminated character array
* filesize long
* lastModified long
* md5 String
* flags integer
*
* ------ LINEAR INDEX
* nChromosomes integer
*/
public class LinearIndex extends AbstractIndex {
// NOTE: To debug uncomment the System.getProperty and recompile.
public static final double MAX_FEATURES_PER_BIN = Double.parseDouble(System.getProperty("MAX_FEATURES_PER_BIN", "100"));
public static final int INDEX_TYPE = IndexType.LINEAR.fileHeaderTypeIdentifier;
private final static int MAX_BIN_WIDTH = 1 * 1000 * 1000 * 1000; // widths must be less than 1 billion
// 1MB: we will no merge bins with any features in them beyond this size, no matter how sparse, per chromosome
private static final long MAX_BIN_WIDTH_FOR_OCCUPIED_CHR_INDEX = Long.parseLong(System.getProperty("MAX_BIN_WIDTH_FOR_OCCUPIED_CHR_INDEX", "1024000"));
public static boolean enableAdaptiveIndexing = true;
/**
* Initialize using the specified {@code indices}
* @param indices
* @param featureFile
*/
public LinearIndex(final List indices, final Path featureFile) {
super(featureFile);
for (final ChrIndex index : indices)
chrIndices.put(index.getName(), index);
}
/**
* Initialize using the specified {@code indices}
* @param indices
* @param featureFile
*/
public LinearIndex(final List indices, final File featureFile) {
this(indices, IOUtil.toPath(featureFile));
}
private LinearIndex(final LinearIndex parent, final List indices) {
super(parent);
for (final ChrIndex index : indices)
chrIndices.put(index.getName(), index);
}
/**
* Initialize with default parameters
* @param featureFile File for which this is an index
*/
public LinearIndex(final String featureFile) {
super(featureFile);
}
/**
* Initialize with default parameters
* @param featurePath Path for which this is an index
*/
public LinearIndex(final Path featurePath) {
super(featurePath);
}
/**
* Load from file.
* @param inputStream This method assumes that the input stream is already buffered as appropriate.
*/
public LinearIndex(final InputStream inputStream) throws IOException {
final LittleEndianInputStream dis = new LittleEndianInputStream(inputStream);
validateIndexHeader(INDEX_TYPE, dis);
read(dis);
}
@Override
public boolean isCurrentVersion() {
if (!super.isCurrentVersion()) return false;
// todo fixme nasty hack to determine if this is an old style V3 linear index (without nFeaturesPerBin)
for (final htsjdk.tribble.index.ChrIndex chrIndex : chrIndices.values())
if (((ChrIndex) chrIndex).OLD_V3_INDEX)
return false;
return true;
}
@Override
protected int getType() {
return INDEX_TYPE;
}
@Override
public List getSequenceNames() {
return (chrIndices == null ? Collections.emptyList() :
Collections.unmodifiableList(new ArrayList<>(chrIndices.keySet())));
}
@Override
public Class getChrIndexClass() {
return ChrIndex.class;
}
/**
* Blocks are organized as a simple flat list:
*
* Block 0
* Block 1
* Block 2
*
* There's a constant bin width, so that each block corresponds to a specific interval
* over the genome based on its index, as in:
*
* Block 0: (0 - binWidth]
* Block 1: (binWidth - 2 * binWidth]
* Block 2: (2 * binWidth - 3 * binWidth]
*
* Note that covered regions are open on the left ( and closed on the right ].
*
* In general, if block i is the ith block (starting from 0), then block i
* contains all records that have starting position > (i * binWidth) and
* <= ((i + 1) * binWidth))
*/
public static class ChrIndex implements htsjdk.tribble.index.ChrIndex {
private String name = "";
private int binWidth;
private int longestFeature;
private int nFeatures;
private List blocks;
private boolean OLD_V3_INDEX = false;
/**
* Default constructor needed for factory methods -- DO NOT REMOVE
*/
public ChrIndex() {
}
ChrIndex(final String name, final int binWidth) {
this.name = name;
this.binWidth = binWidth;
this.blocks = new ArrayList(100);
this.longestFeature = 0;
//this.largestBlockSize = 0;
this.nFeatures = 0;
}
@Override
public String getName() {
return name;
}
void addBlock(final Block block) {
blocks.add(block);
//largestBlockSize = Math.max(largestBlockSize, block.getSize());
}
public int getNBlocks() {
return blocks.size();
}
@Override
public List getBlocks() {
return blocks;
}
@Override
public List getBlocks(final int start, final int end) {
if (blocks.isEmpty()) {
return Collections.emptyList();
} else {
// Adjust position for the longest feature in this chromosome. This insures we get
// features that start before the bin but extend into it
final int adjustedPosition = Math.max(start - longestFeature, 0);
final int startBinNumber = adjustedPosition / binWidth;
if (startBinNumber >= blocks.size()) // are we off the end of the bin list, so return nothing
return Collections.emptyList();
else {
final int endBinNumber = Math.min((end - 1) / binWidth, blocks.size() - 1);
// By definition blocks are adjacent for the liner index. Combine them into one merged block
final long startPos = blocks.get(startBinNumber).getStartPosition();
final long endPos = blocks.get(endBinNumber).getStartPosition() + blocks.get(endBinNumber).getSize();
final long size = endPos - startPos;
if (size == 0) {
return Collections.emptyList();
} else {
final Block mergedBlock = new Block(startPos, size);
return Collections.singletonList(mergedBlock);
}
}
}
}
public void updateLongestFeature(final int featureLength) {
longestFeature = Math.max(longestFeature, featureLength);
}
public int getNFeatures() {
return this.nFeatures;
}
public void incrementFeatureCount() {
this.nFeatures++;
}
@Override
public void write(final LittleEndianOutputStream dos) throws IOException {
// Chr name, binSize, # bins, longest feature
dos.writeString(name);
dos.writeInt(binWidth);
dos.writeInt(blocks.size());
dos.writeInt(longestFeature);
dos.writeInt(0); // no longer used
//dos.writeInt(largestBlockSize);
dos.writeInt(nFeatures);
long pos = 0;
long size = 0;
for (final Block block : blocks) {
pos = block.getStartPosition();
size = block.getSize();
dos.writeLong(pos);
}
// End of last block for this chromosome
dos.writeLong(pos + size);
}
@Override
public void read(final LittleEndianInputStream dis) throws IOException {
name = dis.readString();
binWidth = dis.readInt();
final int nBins = dis.readInt();
longestFeature = dis.readInt();
//largestBlockSize = dis.readInt();
// largestBlockSize and totalBlockSize are old V3 index values. largest block size should be 0 for
// all newer V3 block. This is a nasty hack that should be removed when we go to V4 (XML!) indices
OLD_V3_INDEX = dis.readInt() > 0;
nFeatures = dis.readInt();
// note the code below accounts for > 60% of the total time to read an index
blocks = new ArrayList(nBins);
long pos = dis.readLong();
for (int binNumber = 0; binNumber < nBins; binNumber++) {
final long nextPos = dis.readLong();
final long size = nextPos - pos;
blocks.add(new Block(pos, size));
pos = nextPos;
}
}
public boolean equals(final Object obj) {
if (this == obj) return true;
if (!(obj instanceof ChrIndex)) return false;
final ChrIndex other = (ChrIndex) obj;
return binWidth == other.binWidth
&& longestFeature == other.longestFeature
//&& largestBlockSize == other.largestBlockSize
&& nFeatures == other.nFeatures
&& name.equals(other.name)
&& blocks.equals(other.blocks);
}
@Override
public int hashCode() {
return Objects.hash(binWidth, longestFeature, nFeatures, name, blocks);
}
/**
* @return Total size of all blocks
*/
public long getTotalSize() {
long n = 0;
for (final Block b : getBlocks())
n += b.getSize();
return n;
}
public double getAverageFeatureSize() {
return (1.0 * getTotalSize()) / getNFeatures();
}
public double getFeaturesPerBlock() {
return (1.0 * getNFeatures()) / getNBlocks();
}
private double getNFeaturesOfMostDenseBlock(final double featureSize) {
double m = -1;
for (final Block b : getBlocks()) {
final double n = b.getSize() / featureSize;
if (m == -1 || n > m) m = n;
}
return m;
}
private double optimizeScore() {
return getNFeaturesOfMostDenseBlock(getAverageFeatureSize());
}
public ChrIndex optimize(final double threshold) {
return optimize(this, threshold, 0);
}
private static boolean badBinWidth(final ChrIndex idx) {
if (idx.binWidth > MAX_BIN_WIDTH || idx.binWidth < 0) // an overflow occurred
return true;
else if (MAX_BIN_WIDTH_FOR_OCCUPIED_CHR_INDEX != 0 && idx.getNFeatures() > 1 && idx.binWidth > MAX_BIN_WIDTH_FOR_OCCUPIED_CHR_INDEX) {
return true;
} else {
return false;
}
}
private static ChrIndex optimize(ChrIndex idx, final double threshold, int level) {
ChrIndex best = idx;
while (true) {
final double score = idx.optimizeScore();
if (score > threshold || idx.getNBlocks() == 1 || badBinWidth(idx))
break;
else {
best = idx; // remember the last best option
// try to make a better one
idx = mergeBlocks(idx);
level++;
}
if (level > 30) throw new IllegalStateException("Too many iterations");
}
return best;
}
private static ChrIndex mergeBlocks(final ChrIndex idx) {
final ChrIndex merged = new ChrIndex(idx.name, idx.binWidth * 2); // increasing width by 2 each time
merged.longestFeature = idx.longestFeature;
merged.nFeatures = idx.nFeatures;
final Iterator blocks = idx.getBlocks().iterator();
if (!blocks.hasNext())
throw new IllegalStateException("Block iterator cannot be empty at the start for " + idx.getName());
// extremely simple merging algorithm. Walk left to right, joining up blocks adjacent blocks.
while (blocks.hasNext()) {
final Block b1 = blocks.next();
final Block b2 = blocks.hasNext() ? blocks.next() : null;
if (b2 == null)
merged.addBlock(b1);
else
// the new block is simply the start of the first block and the size of both together
merged.addBlock(new Block(b1.getStartPosition(), b1.getSize() + b2.getSize()));
}
return merged;
}
private static String dupString(final char c, final int nCopies) {
final char[] chars = new char[nCopies];
Arrays.fill(chars, c);
return new String(chars);
}
}
/**
* Adapative optimization of the linear index
* @param threshold threshold to use for optimizing each constituent {@code chrIndex}
* @return The new optimized index
*/
public Index optimize(final double threshold) {
if (enableAdaptiveIndexing) {
final List newIndices = new ArrayList(this.chrIndices.size());
for (final String name : chrIndices.keySet()) {
final LinearIndex.ChrIndex oldIdx = (LinearIndex.ChrIndex) chrIndices.get(name);
final LinearIndex.ChrIndex newIdx = oldIdx.optimize(threshold);
newIndices.add(newIdx);
}
return new LinearIndex(this, newIndices);
} else {
return this;
}
}
public Index optimize() {
return optimize(MAX_FEATURES_PER_BIN);
}
/**
* Code to convert linear index to a text table for analysis
* @param out Stream to which to write out table to
*/
public void writeTable(final PrintStream out) {
out.printf("chr binWidth avg.feature.size nFeatures.total block.id start.pos size nFeatures%n");
for (final String name : chrIndices.keySet()) {
final LinearIndex.ChrIndex chrIdx = (LinearIndex.ChrIndex) chrIndices.get(name);
int blockCount = 0;
for (final Block b : chrIdx.getBlocks()) {
out.printf("%s %d %.2f %d %d %d %d %d%n", name, chrIdx.binWidth, chrIdx.getAverageFeatureSize(), chrIdx.getNFeatures(), blockCount,
blockCount * chrIdx.binWidth, b.getSize(), (int) (b.getSize() / chrIdx.getAverageFeatureSize()));
blockCount++;
}
}
}
// purely for testing purposes
protected final void setTS(final long ts) {
this.indexedFileTS = ts;
}
}