org.broad.igv.bbfile.BPTree Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
/*
 * Copyright (c) 2007-2011 by The Broad Institute of MIT and Harvard.  All Rights Reserved.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 *
 * THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR
 * WARRANTES OF ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING,
 * WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER
 * OR NOT DISCOVERABLE.  IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR RESPECTIVE
 * TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES
 * OF ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
 * ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER
 * THE BROAD OR MIT SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT
 * SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
 */

package org.broad.igv.bbfile;

import org.apache.log4j.Logger;
import org.broad.tribble.util.LittleEndianInputStream;
import org.broad.tribble.util.SeekableStream;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by IntelliJ IDEA.
 * User: martind
 * Date: Dec 17, 2009
 * Time: 12:28:30 PM
 * To change this template use File | Settings | File Templates.
 */
/*
*   B+ Tree class will construct a B+ tree from a binary Bed/Wig BBFile.
*   (or by insertion of tree nodes - TBD see insert method)
*
*   1) BPTree will first read in the B+ tree header with BPTreeHeader class.
*
*   2) Starting with the root node, the readBPTreeNode method will read in the
*   node format, determine if the node contains child nodes (isLeaf = false)
*   or leaf items (isLeaf = true).
*
*   3) If node is a leaf node, all leaf items are read in to the node's leaf array.
*
*   4) If node is a child node, readBPTreeNode will be called recursively,
*   until the leaf node is encountered, where step 3 is performed.
*
*   5) The child nodes will be populated with their child node items in reverse order
*   of recursion from step 4, until the tree is completely populated
*   back up to the root node.
*
*   6) The getChromosomeKey method constructs a valid key for searches of the
*   chromosome B+ tree, and the getChromosomeID method returns the chromosome ID
*   used for searches in the R+ index tree.
*
**/ 
public class BPTree {

    private static final Logger log = Logger.getLogger(BPTree.class);

    public static final int BPTREE_NODE_FORMAT_SIZE = 4;   // node format size
    public static final int BPTREE_NODE_ITEM_SIZE = 8;     // Plus keySize to be added

    // B+ tree access variables - for reading in B+ tree nodes from a file
    private final SeekableStream fis;      // file handle - BBFile input stream
    private final long treeOffset;         // chromosome B+ tree file offset
    private final BPTreeHeader treeHeader; // B+ tree header (Table E for BBFile)

    // B+ tree organizational variables - copied from the header (Table E)
    private final int blockSize;     // number of children per block
    private final int keySize;       // character size of primary key
    private final int valueSize;     // number of bytes in value being indexed
    private final long itemCount;    // number of contig/chromosome items in tree

    // B+ tree nodal variables
    private final BPTreeNode rootNode;   // B+ tree root node
    private long nodeCount;              // number of nodes read so far; incremented per node
    private long leafCount;              // number of leaf items read so far; incremented per leaf item

    /**
     * Reads a B+ tree from a BBFile input stream.
     *
     * <p>The header is read first; if it is not valid the error is logged and a
     * {@link RuntimeException} is thrown. Otherwise the root node is read, which
     * recursively reads every node below it.
     *
     * @param fis         file input stream handle
     * @param fileOffset  file offset of the B+ tree header
     * @param isLowToHigh if true the byte order is low to high (little endian);
     *                    otherwise it is high to low (big endian)
     * @throws RuntimeException if the header is not valid or a node cannot be read
     */
    public BPTree(SeekableStream fis, long fileOffset, boolean isLowToHigh) {

        // Save the seekable file handle and the B+ tree offset
        // Note: the offset is the B+ tree header (Table E) file location
        this.fis = fis;
        this.treeOffset = fileOffset;

        // read in the B+ tree header - verify the B+ tree info exists
        treeHeader = new BPTreeHeader(this.fis, treeOffset, isLowToHigh);

        // log error if header not found and throw exception
        if (!treeHeader.isHeaderOK()) {
            int badMagic = treeHeader.getMagic();
            log.error("Error reading B+ tree header: bad magic = " + badMagic);
            throw new RuntimeException("Error reading B+ tree header: bad magic = "
                    + badMagic);
        }

        // assign B+ tree specifications from the header
        blockSize = treeHeader.getBlockSize();
        keySize = treeHeader.getKeySize();
        valueSize = treeHeader.getValSize();
        itemCount = treeHeader.getItemCount();

        // the root node immediately follows the header
        long nodeOffset = treeOffset + treeHeader.BPTREE_HEADER_SIZE;

        // read the root node (it has no parent) - this recursively populates the remaining nodes
        rootNode = readBPTreeNode(this.fis, nodeOffset, null, isLowToHigh);
    }

    /**
     * @return the file input stream handle supplied to the constructor
     */
    public SeekableStream getFis() {
        return fis;
    }

    /**
     * @return the file offset of the B+ tree header
     */
    public long getBPTreeOffset() {
        return treeOffset;
    }

    /**
     * @return the B+ tree header (Table E)
     */
    public BPTreeHeader getTreeHeader() {
        return treeHeader;
    }

    /**
     * @return the node block size (B+ order) reported by the header
     */
    public int getBlockSize() {
        return blockSize;
    }

    /**
     * @return the chromosome name key size, which is the number of valid
     *         characters for a chromosome name
     */
    public int getKeySize() {
        return keySize;
    }

    /**
     * @return the indexing value size reported by the header
     */
    public int getValueSize() {
        return valueSize;
    }

    /**
     * @return the number of chromosome/contig names reported by the header
     */
    public long getItemCount() {
        return itemCount;
    }

    /**
     * @return the number of nodes read into the B+ tree
     */
    public long getNodeCount() {
        return nodeCount;
    }

    /**
     * @return the root node, from which all other nodes can be reached
     */
    public BPTreeNode getRootNode() {
        return rootNode;
    }


    // Cache of chromosome name -> search key, filled lazily by getChromosomeKey.
    Map<String, String> chromosomeKeyCache = new HashMap<String, String>();

    /**
     * Returns a search key for the chromosome which can be used to search for
     * the corresponding entry in the B+ tree.
     *
     * <p>According to the spec the key is the first keySize characters of the
     * chromosome name, padded with zeroes if needed. The returned string is
     * therefore always exactly keySize characters long; unused trailing
     * positions hold the NUL character. Keys are cached per chromosome name.
     *
     * @param chromosome chromosome name
     * @return key of exactly keySize characters
     */
    public String getChromosomeKey(String chromosome) {

        String key = chromosomeKeyCache.get(chromosome);
        if (key == null) {
            // a new char array is zero filled, which supplies the padding
            char[] keyChars = new char[keySize];
            int copyLength = Math.min(keySize, chromosome.length());
            chromosome.getChars(0, copyLength, keyChars, 0);
            key = new String(keyChars);
            chromosomeKeyCache.put(chromosome, key);
        }
        return key;
    }

    /**
     * Returns a chromosome ID which can be used to search for the
     * corresponding data section in the R+ tree.
     *
     * <p>NOTE(review): node keys are stored trimmed when the tree is read,
     * while getChromosomeKey returns a zero-padded key. Whether a padded key
     * matches here depends on BPTreeLeafNodeItem.chromKeysMatch and on the
     * node key-range getters - confirm against those classes.
     *
     * @param chromKey chromosome name of valid key size
     * @return chromosome ID; -1 means the chromosome name is not in the B+ tree
     */
    public int getChromosomeID(String chromKey) {
        // search the B+ tree from the root to extract the chromosome ID
        return findChromosomeID(rootNode, chromKey);
    }

    /**
     * Returns the chromosome name which is the B+ key for the given chromosome ID.
     *
     * @param chromID chromosome ID expected in the B+ tree
     * @return chromosome name key; null means the chromosome ID was not found
     */
    public String getChromosomeName(int chromID) {
        // search the B+ tree from the root to extract the chromosome name
        return findChromosomeName(rootNode, chromID);
    }

    /**
     * Returns all chromosome key names in the B+ tree.
     *
     * @return list of all chromosome names, in tree traversal order
     */
    public ArrayList<String> getChromosomeNames() {

        ArrayList<String> chromosomeList = new ArrayList<String>();

        // collect the names from every leaf below the root
        findAllChromosomeNames(rootNode, chromosomeList);

        return chromosomeList;
    }

    /**
     * Returns all (chromosome ID, chromosome name) pairs for a given ID range.
     *
     * @param startChromID starting ID for the chromosome range expected in the B+ tree
     * @param endChromID   ending ID for the chromosome range expected in the B+ tree
     * @return map of chromosome ID to chromosome name key; an empty map means
     *         the ID range was not found
     */
    public HashMap<Integer, String> getChromosomeIDMap(int startChromID, int endChromID) {

        HashMap<Integer, String> chromosomeIDMap = new HashMap<Integer, String>();

        // collect the matching leaf items below the root
        findChromosomeMap(rootNode, startChromID, endChromID, chromosomeIDMap);

        return chromosomeIDMap;
    }

    /**
     * Prints out the B+ tree header, nodes and leaves.
     * Logs an error and prints nothing if the header is not valid.
     */
    public void print() {

        // check if read in
        if (!treeHeader.isHeaderOK()) {
            int badMagic = treeHeader.getMagic();
            log.error("Error reading B+ tree header: bad magic = " + badMagic);
            return;
        }

        // print B+ tree header
        treeHeader.print();

        // print B+ tree node and leaf items - recursively
        if (rootNode != null) {
            rootNode.printItems();
        }
    }

    /**
     * Finds the chromosome ID for the specified chromosome key.
     *
     * <p>The method calls itself recursively, descending only into child nodes
     * whose key range contains the key, until a leaf item matches or the
     * candidates are exhausted.
     *
     * @param thisNode tree node to start the search from
     * @param chromKey chromosome name key of valid key size
     * @return valid chromosome ID if &gt;= 0; else -1 for not found
     * @throws RuntimeException if a leaf node holds a null item
     */
    private int findChromosomeID(BPTreeNode thisNode, String chromKey) {
        int chromID = -1;    // until found

        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {
                BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem) thisNode.getItem(index);
                if (leaf == null) {
                    log.error("Error finding B+ tree leaf nodes, corruption suspected");
                    throw new RuntimeException("Error reading B+ tree leaf nodes, corruption suspected");
                }

                // test chromosome key match; otherwise check the next leaf item
                if (leaf.chromKeysMatch(chromKey)) {
                    chromID = leaf.getChromID();
                    break;
                }
            }
        } else {
            // check all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {

                BPTreeChildNodeItem childItem = (BPTreeChildNodeItem) thisNode.getItem(index);
                BPTreeNode childNode = childItem.getChildNode();

                // only descend when the key lies within the child's key range
                String lowestKey = childNode.getLowestChromKey();
                String highestKey = childNode.getHighestChromKey();
                if (chromKey.compareTo(lowestKey) >= 0
                        && chromKey.compareTo(highestKey) <= 0) {

                    // keep going until leaf items are checked
                    chromID = findChromosomeID(childNode, chromKey);

                    // stop as soon as a match is found
                    if (chromID >= 0) {
                        break;
                    }
                }
            }
        }

        return chromID;
    }

    /**
     * Finds the chromosome name for the specified chromosome ID.
     *
     * <p>The method calls itself recursively, descending only into child nodes
     * whose ID range contains the ID.
     *
     * @param thisNode tree node to start the search from
     * @param chromID  B+ tree chromosome ID to look up
     * @return chromosome name if found; else null
     */
    private String findChromosomeName(BPTreeNode thisNode, int chromID) {

        String chromKey = null; // null marks the not-found condition

        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {
                BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem) thisNode.getItem(index);

                // chromosome ID match; otherwise check the next leaf item
                if (leaf.getChromID() == chromID) {
                    chromKey = leaf.getChromKey();
                    break;
                }
            }
        } else {
            // check all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {

                BPTreeChildNodeItem childItem = (BPTreeChildNodeItem) thisNode.getItem(index);
                BPTreeNode childNode = childItem.getChildNode();

                // only descend when the ID lies within the child's ID range
                int lowestID = childNode.getLowestChromID();
                int highestID = childNode.getHighestChromID();
                if (chromID >= lowestID && chromID <= highestID) {

                    // keep going until leaf items are checked
                    chromKey = findChromosomeName(childNode, chromID);

                    // stop as soon as a match is found
                    if (chromKey != null) {
                        break;
                    }
                }
            }
        }

        return chromKey;
    }

    /**
     * Finds all chromosome names in the B+ tree at or below the given node.
     *
     * <p>The method calls itself recursively until the full subtree is traversed.
     *
     * @param thisNode       tree node to start the search from
     * @param chromosomeList list to which every chromosome name found is appended
     */
    public void findAllChromosomeNames(BPTreeNode thisNode, ArrayList<String> chromosomeList) {

        if (thisNode.isLeaf()) {
            // add all leaf names
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {

                BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem) thisNode.getItem(index);
                chromosomeList.add(leaf.getChromKey());
            }
        } else {
            // visit all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {

                BPTreeChildNodeItem childItem = (BPTreeChildNodeItem) thisNode.getItem(index);
                BPTreeNode childNode = childItem.getChildNode();

                // keep going until leaf items are extracted
                findAllChromosomeNames(childNode, chromosomeList);
            }
        }
    }

    /**
     * Finds the (chromosome ID, chromosome key name) pairs for the specified ID range.
     *
     * <p>The early exits in both loops rely on IDs being stored in ascending order.
     *
     * @param thisNode      tree node to start the search from
     * @param startChromID  starting chromosome ID for the chromosome range
     * @param endChromID    ending chromosome ID for the chromosome range
     * @param chromosomeMap map to which every matching (ID, name) pair is added
     */
    private void findChromosomeMap(BPTreeNode thisNode, int startChromID, int endChromID,
                                   HashMap<Integer, String> chromosomeMap) {

        // nothing to do if the node's ID range is disjoint from the requested range
        int lowestID = thisNode.getLowestChromID();
        if (lowestID > endChromID) {
            return;
        }

        int highestID = thisNode.getHighestChromID();
        if (highestID < startChromID) {
            return;
        }

        if (thisNode.isLeaf()) {
            int nLeaves = thisNode.getItemCount();
            for (int index = 0; index < nLeaves; ++index) {

                BPTreeLeafNodeItem leaf = (BPTreeLeafNodeItem) thisNode.getItem(index);
                int chromID = leaf.getChromID();

                // check for chromosome range match
                if (chromID >= startChromID && chromID <= endChromID) {
                    chromosomeMap.put(chromID, leaf.getChromKey());
                }
                // leaf IDs are in ascending order; stop once past the range
                else if (chromID > endChromID) {
                    break;
                }
            }
        } else {
            // check all child nodes
            int nNodes = thisNode.getItemCount();
            for (int index = 0; index < nNodes; ++index) {

                BPTreeChildNodeItem childItem = (BPTreeChildNodeItem) thisNode.getItem(index);
                BPTreeNode childNode = childItem.getChildNode();

                lowestID = childNode.getLowestChromID();
                highestID = childNode.getHighestChromID();

                // descend when the child's ID range intersects the requested range
                if (lowestID <= endChromID && highestID >= startChromID) {
                    findChromosomeMap(childNode, startChromID, endChromID, chromosomeMap);
                }
                // node ID ranges are in ascending order; stop once past the range
                else if (lowestID > endChromID) {
                    break;
                }
            }
        }
    }

    /**
     * Reads a B+ tree node from the file, and recursively every node below it.
     *
     * <p>A node starts with a BPTREE_NODE_FORMAT_SIZE byte format record
     * (node type, one reserved byte, 16-bit item count) followed by the items.
     * Each item is a keySize byte key followed by BPTREE_NODE_ITEM_SIZE bytes
     * of payload: (chromosome ID, chromosome size) for a leaf item, or the
     * file offset of the child node for a child item.
     *
     * @param fis         file input stream handle
     * @param fileOffset  file offset of the node to read
     * @param parent      parent node (not used by this method)
     * @param isLowToHigh if true the byte order is low to high (little endian);
     *                    otherwise it is high to low (big endian)
     * @return the populated node
     * @throws RuntimeException if the node cannot be read from the stream
     */
    private BPTreeNode readBPTreeNode(SeekableStream fis, long fileOffset,
                                      BPTreeNode parent, boolean isLowToHigh) {

        ByteOrder byteOrder = isLowToHigh ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN;
        BPTreeNode thisNode;

        try {
            // read the node format record
            byte[] formatBytes = new byte[BPTREE_NODE_FORMAT_SIZE];
            fis.seek(fileOffset);
            fis.readFully(formatBytes);
            ByteBuffer format = ByteBuffer.wrap(formatBytes).order(byteOrder);

            // node type: 1 marks a leaf node, anything else a child (index) node
            boolean isLeaf = (format.get() == 1);
            if (isLeaf) {
                thisNode = new BPTreeLeafNode(++nodeCount);
            } else {
                thisNode = new BPTreeChildNode(++nodeCount);
            }

            format.get();      // reserved - not currently used

            // the count is an unsigned 16-bit field; mask so large counts do not go negative
            int nodeItemCount = format.getShort() & 0xFFFF;

            // Note: B+ tree node item size is the same for leaf and child items.
            // The items directly follow the format record, so no seek is needed.
            int itemSize = BPTREE_NODE_ITEM_SIZE + keySize;
            byte[] itemBytes = new byte[itemSize * nodeItemCount];
            fis.readFully(itemBytes);
            ByteBuffer items = ByteBuffer.wrap(itemBytes).order(byteOrder);

            // get the node items - leaves or child nodes
            for (int item = 0; item < nodeItemCount; ++item) {

                // every item starts with its key
                String key = readKey(items);

                if (isLeaf) {
                    int chromID = items.getInt();
                    int chromSize = items.getInt();

                    // insert leaf item
                    BPTreeLeafNodeItem leafItem =
                            new BPTreeLeafNodeItem(++leafCount, key, chromID, chromSize);
                    thisNode.insertItem(leafItem);
                } else {
                    // read the child node pointed to by the node item; all items of
                    // this node are already buffered, so the seek done by the
                    // recursive call does not disturb this loop
                    long childOffset = items.getLong();
                    BPTreeNode childNode = readBPTreeNode(this.fis, childOffset, thisNode, isLowToHigh);

                    // insert child node item
                    BPTreeChildNodeItem childItem = new BPTreeChildNodeItem(item, key, childNode);
                    thisNode.insertItem(childItem);
                }
            }

        } catch (IOException ex) {
            log.error("Error reading B+ tree node " + ex);
            throw new RuntimeException("Error reading B+ tree node \n ", ex);
        }

        // success: return node
        return thisNode;
    }

    /**
     * Reads the next keySize bytes from the buffer as a key, one character per
     * byte, and strips the padding with String.trim().
     *
     * @param items buffer positioned at the start of a key
     * @return the key without leading/trailing padding
     */
    private String readKey(ByteBuffer items) {
        char[] keyChars = new char[keySize];
        for (int index = 0; index < keySize; ++index) {
            // mask to treat the byte as unsigned; a plain cast would sign-extend
            keyChars[index] = (char) (items.get() & 0xFF);
        }
        return new String(keyChars).trim();
    }


}