All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.AnnotateTOPM Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

The newest version!
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package net.maizegenetics.analysis.gbs;

import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.dna.map.TagMappingInfoV3.Aligner;
import net.maizegenetics.dna.tag.SAMUtils;
import net.maizegenetics.dna.tag.TagCounts;
import net.maizegenetics.dna.tag.TagsByTaxa.FilePacking;
import net.maizegenetics.util.MultiMemberGZIPInputStream;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.TreeSet;

/**
 * Methods to annotate TOPM file, including adding mapping info from aligners, adding PE tag position and genetic position, model prediction for the best position
 * @author Fei Lu
 */
public class AnnotateTOPM {
    /**TOPM file that will be annotated*/
    TagsOnPhysicalMapV3 topm;
    /**Record Sam record. When tmiBuffers[0] is full, output to TOPM block. Substitute tmiBuffers[i] with tmiBuffers[i+1]
     * Size = numBuffers x maxMappingNum(-K option) x TOPM CHUNK_SIZE
     * Multiple buffers are used because the output in SAM is not exactly the order of input tag, roughly in the same order though
     */
    TagMappingInfoV3[][][] tmiBuffers = null;
    /**check if tmiBuffer[0] is full for output*/
    boolean[][] bufferLights = null;
    /**Number of buffers*/
    int bufferNum = 2;
    /**record how many mapping tags are imported in each buffer*/
    int[] lightCounts;
    /**Actual tag index of each tmiBuffers[i]*/
    int[] bufferStartTagIndex = null;
    /**Tag index range of the tmiBuffers*/
    int[] bufferTagIndexRange = null;
    /**When the second buffer has filled at least with this cutoff, update buffer*/
    int updateBufferCountCutoff;
    
    public static enum EvidenceType {
        GM((byte)1, 0),
        PE((byte)(1<<1), 1),
        SingleBestBowtie2((byte)(1<<2), 2),
        BestBWA((byte)(1<<3), 3),
        SingleBestBlast((byte)(1<<4), 4),
        SingleBestBWAMEM((byte)(1<<5), 5);
        private byte typeCode;
        private int index;
        EvidenceType (byte value, int indexInBestEvidence) {
            typeCode = value;
            index = indexInBestEvidence;
        }
        
        public byte getCode () {
            return typeCode;
        }
        
        public int getIndex () {
            return index;
        }
        
        public boolean getIfHasEvidence (AnnotateTOPM.EvidenceType type, boolean[] ifEvidence) {
            return ifEvidence[type.getIndex()];
        }
        
        public static byte code (boolean[] ifEvidence) {
            int evidenceByte = 0;
            for (int i = 0; i < ifEvidence.length; i++) {
                evidenceByte = evidenceByte << 1;
                if (ifEvidence[i] == true) evidenceByte = evidenceByte|1;              
            }
            return (byte)evidenceByte;
        }
        
        public static boolean[] deCode (byte evidenceByte) {
            boolean[] ifEvidence = new boolean[AnnotateTOPM.EvidenceType.getSize()];
            byte test = 1;
            for (int i = 0; i < ifEvidence.length; i++) {
                int value = (evidenceByte>>i)&test;
                if (value == 0) ifEvidence[ifEvidence.length-i-1] = false;
                else ifEvidence[ifEvidence.length-i-1] = true;
            }
            return ifEvidence;
        }
        
        public static int getSize () {
            return AnnotateTOPM.EvidenceType.values().length;
        }
        
    }
    
    /**
     * Constructor from a TOPM file
     * @param topm 
     */
    public AnnotateTOPM (TagsOnPhysicalMapV3 topm) {
        this.topm = topm;   
    }
    
    /**
     * Import prediction result to TOPM, contain hard coding on map index
     * @param indexDirS
     * @param predictDirS
     * @param priorityAligner 
     */
    public void annotateBestMappingImport (String indexDirS, String predictDirS, Aligner priorityAligner) {
        byte[] bestStrand = new byte[topm.getTagCount()];
        int[] bestChr = new int[topm.getTagCount()];
        int[] bestStartPos = new int[topm.getTagCount()];
        int[] bestEndPos = new int[topm.getTagCount()];
        byte[] bestDivergence = new byte[topm.getTagCount()];
        byte[] bestMapP = new byte[topm.getTagCount()];
        byte[] bestDcoP = new byte[topm.getTagCount()];
        byte[] multimaps = new byte[topm.getTagCount()];
        byte[] bestEvidence = new byte[topm.getTagCount()];
        byte[] bestMapIndex = new byte[topm.getTagCount()];
        int fileNum = new File(predictDirS).listFiles().length;
        for (int i = 0; i < fileNum; i++) {
            String predictFileS = String.valueOf(i)+".out.txt";
            predictFileS = new File(predictDirS, predictFileS).getAbsolutePath();
            String indexFileS = String.valueOf(i)+".index.txt";
            indexFileS = new File(indexDirS, indexFileS).getAbsolutePath();
            int currentTagIndex = -1;
            try {
                BufferedReader brp = new BufferedReader (new FileReader (predictFileS), 65536);
                BufferedReader bri = new BufferedReader (new FileReader (indexFileS), 65536);
                for (int j = 0; j < 5; j++) brp.readLine();
                bri.readLine();
                String temp;
                int tagIndex, mapIndex;
                String[] tem;
                ArrayList mapIndexList = new ArrayList();
                Integer[] mapIndices;
                while ((temp = bri.readLine()) != null) {
                    tem = temp.split("\t");
                    tagIndex = Integer.parseInt(tem[0]);
                    mapIndex = Integer.parseInt(tem[1]);
                    if (tagIndex != currentTagIndex) {
                        if (currentTagIndex != -1) {
                            mapIndices = mapIndexList.toArray(new Integer[mapIndexList.size()]);
                            this.incorperateBestGeneticMapping(currentTagIndex, mapIndices, brp, bestStrand, bestChr, bestStartPos, bestEndPos, bestDivergence, bestMapP, bestDcoP, bestEvidence, bestMapIndex);
                        }
                        mapIndexList = new ArrayList();
                        currentTagIndex = tagIndex;
                    }
                    mapIndexList.add(mapIndex);
                }
                mapIndices = mapIndexList.toArray(new Integer[mapIndexList.size()]);
                this.incorperateBestGeneticMapping(currentTagIndex, mapIndices, brp, bestStrand, bestChr, bestStartPos, bestEndPos, bestDivergence, bestMapP, bestDcoP, bestEvidence, bestMapIndex);
            }
            catch (Exception e) {
                System.out.println(e.toString());
                System.out.println(currentTagIndex);
                System.exit(0);
            }
        }
        int[] priorityIndex = topm.getMappingIndicesOfAligner(priorityAligner);
        int[] alignerStartMapIndex = {0,5,10,15,20,25};
        int[][] alignerIndex = new int[alignerStartMapIndex.length][];
        alignerIndex[0] = topm.getMappingIndicesOfAligner(Aligner.Bowtie2);
        alignerIndex[1] = topm.getMappingIndicesOfAligner(Aligner.BWA);
        alignerIndex[2] = topm.getMappingIndicesOfAligner(Aligner.Blast);
        alignerIndex[3] = topm.getMappingIndicesOfAligner(Aligner.BWAMEM);
        alignerIndex[4] = topm.getMappingIndicesOfAligner(Aligner.PEEnd1);
        alignerIndex[5] = topm.getMappingIndicesOfAligner(Aligner.PEEnd2);
        for (int i = 0; i < topm.getTagCount(); i++) {
            if (bestStrand[i] == 0) {
                int numRank0 = this.getNumOfRank0(i, priorityIndex);
                TagMappingInfoV3 tmi = topm.getMappingInfo(i, priorityIndex[0]);
                if (numRank0 == 1) {
                    bestStrand[i] = tmi.strand;
                    bestChr[i] = tmi.chromosome;
                    bestStartPos[i] = tmi.startPosition;
                    bestEndPos[i] = tmi.endPosition;
                    bestDivergence[i] = tmi.divergence;
                    bestMapP[i] = tmi.mapP;
                    bestDcoP[i] = tmi.dcoP;
                    bestMapIndex[i] = (byte)priorityIndex[0];
                    multimaps[i] = 1;
                }
                else {
                    bestStrand[i] = Byte.MIN_VALUE;
                    bestChr[i] = Integer.MIN_VALUE;
                    bestStartPos[i] = Integer.MIN_VALUE;
                    bestEndPos[i] = Integer.MIN_VALUE;
                    bestDivergence[i] = Byte.MIN_VALUE;
                    bestMapP[i] = Byte.MIN_VALUE;
                    bestDcoP[i] = Byte.MIN_VALUE;
                    bestMapIndex[i] = Byte.MIN_VALUE;
                    if (tmi.strand == Byte.MIN_VALUE) multimaps[i] = 0;
                    else multimaps[i] = 99;
                }
            }
            boolean[] ifEvidence = AnnotateTOPM.EvidenceType.deCode(bestEvidence[i]);
            TagMappingInfoV3 tmi = topm.getMappingInfo(i, alignerIndex[4][0]);
            if (tmi.chromosome == bestChr[i] && tmi.startPosition == bestStartPos[i] && tmi.strand == bestStrand[i]) {
                ifEvidence[AnnotateTOPM.EvidenceType.PE.getIndex()] = true;
            }
            int numRank0 = this.getNumOfRank0(i, alignerIndex[0]);
            tmi = topm.getMappingInfo(i, alignerIndex[0][0]);
            if (numRank0 == 0 && tmi.chromosome == bestChr[i] && tmi.startPosition == bestStartPos[i] && tmi.strand == bestStrand[i]) {
                ifEvidence[AnnotateTOPM.EvidenceType.SingleBestBowtie2.getIndex()] = true;
            }
            tmi = topm.getMappingInfo(i, alignerIndex[1][0]);
            if (tmi.chromosome == bestChr[i] && tmi.startPosition == bestStartPos[i] && tmi.strand == bestStrand[i]) {
                ifEvidence[AnnotateTOPM.EvidenceType.BestBWA.getIndex()] = true;
            }
            tmi = topm.getMappingInfo(i, alignerIndex[2][0]);
            if (numRank0 == 0 && tmi.chromosome == bestChr[i] && tmi.startPosition == bestStartPos[i] && tmi.strand == bestStrand[i]) {
                ifEvidence[AnnotateTOPM.EvidenceType.SingleBestBlast.getIndex()] = true;
            }
            tmi = topm.getMappingInfo(i, alignerIndex[3][0]);
            if (numRank0 == 0 && tmi.chromosome == bestChr[i] && tmi.startPosition == bestStartPos[i] && tmi.strand == bestStrand[i]) {
                ifEvidence[AnnotateTOPM.EvidenceType.SingleBestBWAMEM.getIndex()] = true;
            }
            bestEvidence[i] = AnnotateTOPM.EvidenceType.code(ifEvidence);
        }
        topm.writeBestMappingDataSets(bestStrand, bestChr, bestStartPos, bestEndPos, bestDivergence, bestMapP, bestDcoP, multimaps, bestEvidence, bestMapIndex);
    }
    
    /**
     * Incorperate the alignment with best genetic mapping, contain hard coding on map index
     * @param tagIndex
     * @param mapIndices
     * @param br
     * @param bestStrand
     * @param bestChr
     * @param bestEvidence 
     */
    private void incorperateBestGeneticMapping (int tagIndex, Integer[] mapIndices, BufferedReader br, byte[] bestStrand, int[] bestChr, int[] bestStartPos, int[] bestEndPos, byte[] bestDivergence, byte[] bestMapP, byte[] bestDcoP, byte[] bestEvidence, byte[] bestMapIndices) {
        ArrayList yMapIndexList = new ArrayList();
        try {
            for (int i = 0; i < mapIndices.length; i++) {
                String in = br.readLine();
                String[] temp;
                if (in.startsWith(" ")) {
                    temp = in.split("\\s+")[3].split(":");
                }
                else {
                    temp = in.split("\\s+")[2].split(":");
                }
                if (temp[1].equals("N")) continue;
                yMapIndexList.add(mapIndices[i]);
            }
            if (yMapIndexList.isEmpty()) return;
            double pBest = 1;
            int bestMapIndex = -1;
            for (int i = 0; i < yMapIndexList.size(); i++) {
                TagGeneticMappingInfo tgmi = topm.getGeneticMappingInfo(tagIndex, yMapIndexList.get(i));
                if (tgmi.p < 0) continue;
                if (tgmi.p < pBest) {
                    pBest = tgmi.p;
                    bestMapIndex = yMapIndexList.get(i);
                }
            }
            if (bestMapIndex == -1) return;
            if (bestMapIndex >= 25) bestMapIndex-=5;
            TagMappingInfoV3 tmi = topm.getMappingInfo(tagIndex, bestMapIndex);
            bestStrand[tagIndex] = tmi.strand;
            bestChr[tagIndex] = tmi.chromosome;
            bestStartPos[tagIndex] = tmi.startPosition;
            bestEndPos[tagIndex] = tmi.endPosition;
            bestDivergence[tagIndex] = tmi.divergence;
            bestMapP[tagIndex] = tmi.mapP;
            bestDcoP[tagIndex] = tmi.dcoP;
            bestEvidence[tagIndex] = (byte)(1< topm.getTagCount()) endTagIndex = topm.getTagCount();
            try {
                BufferedWriter bwp = new BufferedWriter (new FileWriter(predictFileS), 65536);
                BufferedWriter bwi = new BufferedWriter (new FileWriter(indexFileS), 65536);
                bwp.write("@relation trainingFinalSet\n\n");
                for (int j = 0; j < attributeName.length-1; j++) {
                    bwp.write("@attribute "+ attributeName[j]+ " numeric\n");
                }
                bwp.write("@attribute Rorw {Y,N}\n");
                bwp.write("\n@data\n");
                bwi.write("TagIndex\tMapIndex");
                bwi.newLine();
                for (int j = startTagIndex; j < endTagIndex; j++) {
                    int cnt = 0;
                    long[] tag = topm.getTag(j);
                    int index = tc.getTagIndex(tag);
                    if (index < 0) {
                        System.out.println("TagCount file and TOPM file don't match, program quits");
                        System.exit(0);
                    }
                    int readCount = tc.getReadCount(index);
                    String seq = BaseEncoder.getSequenceFromLong(tag);
                    for (int k = 0; k < topm.getTagLength(j); k++) {
                        if (seq.charAt(k) == 'G' || seq.charAt(k) == 'C') cnt++;
                    }
                    double gc = (double)cnt/topm.getTagLength(j);
                    int[] numOfRank0 = new int[alignerStartMapIndex.length];
                    int[] numOfAlign = new int[alignerStartMapIndex.length];
                    for (int k = 0; k < numOfRank0.length; k++) {
                        numOfRank0[k] = this.getNumOfRank0(j, alignerIndex[k]);
                        numOfAlign[k] = this.getNumOfAlign(j, alignerIndex[k]);
                    }
                    double pBest = this.getPBest(j);
                    if (pBest == 1) continue;
                    int numOfAlignAll = this.getNumOfAlignAll(j);
                    for (int k = 0; k < topm.getMappingNum(); k++) {
                        TagMappingInfoV3 tmi = topm.getMappingInfo(j, k);
                        TagGeneticMappingInfo tgmi = topm.getGeneticMappingInfo(j, k);
                        if (tmi.chromosome < 0) continue;
                        if (tgmi.p < 0) continue;
                        bwp.write(String.valueOf(this.boxcox(readCount, -0.181818))+","); 
                        bwp.write(String.valueOf(topm.getTagLength(j))+",");
                        bwp.write(String.valueOf(gc)+",");
                        bwp.write(String.valueOf(tmi.chromosome)+",");
                        bwp.write(String.valueOf(tmi.startPosition)+",");
                        bwp.write(String.valueOf(tmi.mappingSource)+",");
                        bwp.write(String.valueOf(tmi.mappingRank)+",");
                        if (tmi.mappingScore < 0) {
                            bwp.write(missing+",");
                        }
                        else {
                            bwp.write(String.valueOf(tmi.mappingScore)+",");
                        }
                        
                        if (tgmi.sigSiteNum < 0) {
                            bwp.write(missing+",");
                        }
                        else {
                            double boxValue = this.boxcox(tgmi.sigSiteNum, 0.101010); //262414 tags
                            bwp.write(String.valueOf(boxValue)+",");
                        }
                        if (tgmi.sigSiteRange < 0) {
                            bwp.write(missing+",");
                        }
                        else {
                            double boxValue = this.boxcox(tgmi.sigSiteRange, 0.424242); // 65536 tags and 262414 tags
                            bwp.write(String.valueOf(boxValue)+",");
                        }
                        if (tgmi.p < 0) {
                            bwp.write(missing+",");
                            bwp.write(missing+",");
                        }
                        else {
                            double boxValue = this.boxcox(minusLog10P(tgmi.p), -0.060606); // 262414 tags
                            bwp.write(String.valueOf(boxValue)+",");
                            if (tgmi.p == 0) {
                                bwp.write(String.valueOf(pBest/Double.MIN_VALUE)+",");
                            }
                            else {
                                bwp.write(String.valueOf(pBest/tgmi.p)+",");
                            }  
                        }
                        index = Arrays.binarySearch(alignerStartMapIndex, k);
                        if (index < 0) index = -index -2;
                        bwp.write(String.valueOf(numOfRank0[index])+",");
                        bwp.write(String.valueOf(numOfAlign[index])+",");
                        bwp.write(String.valueOf(numOfAlignAll)+",");
                        bwp.write("N,");
                        bwp.newLine();
                        bwi.write(String.valueOf(j)+"\t"+String.valueOf(k));
                        bwi.newLine();
                    }
                }
                bwp.flush();
                bwi.flush();
                bwp.close();
                bwi.close();
            }
            catch (Exception e) {
                System.out.println(e.toString());
                System.exit(1);
            }
        }
        for (int i = 0; i < fileNum; i++) {
            String predictFileS = String.valueOf(i)+".pre.arff";
            predictFileS = new File(inputDirS, predictFileS).getAbsolutePath();
            String outputFileS = String.valueOf(i)+".out.txt";
            outputFileS = new File(outputDirS, outputFileS).getAbsolutePath();
            try {
                Runtime run = Runtime.getRuntime();
                String cmd = "cmd /c java weka.classifiers.trees.RandomForest -p 0 -T " + predictFileS + " -l " + modelFileS + " > " +outputFileS;
                System.out.println(cmd);
                Process p = run.exec(cmd);
                p.waitFor();
                System.out.println("Prediction is made at " + outputFileS);
            }
            catch (Exception e) {
                System.out.println(e.toString());
                System.exit(1);
            }
        }
    }
    
    /**
     * Return the number of Rank0 in an aligner
     * @param tagIndex
     * @param mapIndex
     * @return 
     */
    public int getNumOfRank0 (int tagIndex, int[] mapIndex) {
        int cnt = 0;
        for (int i = 0; i < mapIndex.length; i++) {
            TagMappingInfoV3 tmi = topm.getMappingInfo(tagIndex, mapIndex[i]);
            if (tmi.chromosome < 0) continue;
            if (tmi.mappingRank != 0) continue;
            cnt++;
        }
        return cnt;
    }
    
    /**
     * Return number of alignment in an aligner
     * @param tagIndex
     * @param mapIndex
     * @return 
     */
    public int getNumOfAlign (int tagIndex, int[] mapIndex) {
        int cnt = 0;
        for (int i = 0; i < mapIndex.length; i++) {
            TagMappingInfoV3 tmi = topm.getMappingInfo(tagIndex, mapIndex[i]);
            if (tmi.chromosome < 0) continue;
            cnt++;
        }
        return cnt;
    }
    
    /**
     * Return the most significant p-value across all hypotheses
     * @param tagIndex
     * @return 
     */
    public double getPBest (int tagIndex) {
        double p = 1;
        for (int i = 0; i < topm.getMappingNum(); i++) {
            TagGeneticMappingInfo tgmi = topm.getGeneticMappingInfo(tagIndex, i);
            if (tgmi.p < 0) continue;
            if (tgmi.p < p) p = tgmi.p;
        }
        if (p == 0) p = Double.MIN_VALUE;
        return p;
    }
    
    /**
     * Return number of alignment hypotheses across all aligners, excluding PEEnd1 and PEEnd2 
     * @param tagIndex
     * @return 
     */
    public int getNumOfAlignAll (int tagIndex) {
        int cnt = 0;
        for (int i = 0; i < 20; i++) {
            TagMappingInfoV3 tmi = topm.getMappingInfo(tagIndex, i);
            if (tmi.chromosome < 0) continue;
            cnt++;
        }
        return cnt;
    }
    
    /**
     * Return -log10(p-value), if p-value doesn't exist(p < 0), reutrn Double.NEGATIVE_INFINITY
     * @param p
     * @return 
     */
    public double minusLog10P (double p) {
        if (p < 0) return Double.NEGATIVE_INFINITY;
        p = -Math.log10(p);
        if (p == Double.POSITIVE_INFINITY) p = -Math.log10(Double.MIN_VALUE);
        return p;
    }
    
    /**
     * Boxcox transform
     * @param y
     * @param lambda
     * @return 
     */
    private double boxcox (double y, double lambda) {
        if (lambda != 0) {
            return (Math.pow(y, lambda)-1)/lambda;
        }
        else {
            return Math.log(y);
        }
    }
    
    /**
     * Annotate the TOPM file using Bowtie2
     * @param samFileS
     * @param maxMappingNum is the max number of multiple alignment
     */
    public void annotateWithBowtie2 (String samFileS, int maxMappingNum) {
        String[] dataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        this.iniTMIBuffers(bufferNum, maxMappingNum);
        System.out.println("Reading SAM format tag alignment (Bowtie2) from: " + samFileS);
        System.out.println("Coverting SAM to TOPMHDF5...");
        byte mappingSource = Aligner.Bowtie2.getValue();
        int chunkCnt = 0;
        try {
            BufferedReader br;
            if (samFileS.endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(new File(samFileS)))));
            } else {
                br = new BufferedReader(new FileReader(new File(samFileS)), 65536);
            }
            while(br.readLine().startsWith("@PG")==false) {};
            String inputStr = null;
            while((inputStr = br.readLine())!=null) {
                String[] temp =inputStr.split("\\s");
                int orientiation=Integer.parseInt(temp[1]);
                int chr = Integer.MIN_VALUE;
                byte strand = Byte.MIN_VALUE;
                int startPos = Integer.MIN_VALUE;
                int endPos = Integer.MIN_VALUE;
                short mappingScore = Short.MIN_VALUE;
                byte divergence = Byte.MIN_VALUE;
                byte perfectMatch = Byte.MIN_VALUE;
                String seqS = temp[9];
                if (orientiation == 4) {
                    
                }
                else if (orientiation == 16 || orientiation == 272) {
                    seqS = BaseEncoder.getReverseComplement(seqS);
                    chr = Integer.parseInt(temp[2]);
                    strand = -1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[1];
                    endPos = alignSpan[0];
                    mappingScore = Short.parseShort(temp[11].split(":")[2]);
                    if (temp[17].startsWith("NM")) {
                        divergence = Byte.parseByte(temp[17].split(":")[2]);
                    }
                    else {
                        divergence = Byte.parseByte(temp[16].split(":")[2]);
                    }
                    if (temp[5].equals(String.valueOf(seqS.length())+"M") && divergence == 0) perfectMatch = 1;
                    else perfectMatch = 0;
                }
                else {
                    chr = Integer.parseInt(temp[2]);
                    strand = 1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[0];
                    endPos = alignSpan[1];
                    mappingScore = Short.parseShort(temp[11].split(":")[2]);
                    if (temp[17].startsWith("NM")) {
                        divergence = Byte.parseByte(temp[17].split(":")[2]);
                    }
                    else {
                        divergence = Byte.parseByte(temp[16].split(":")[2]);
                    }
                    if (temp[5].equals(String.valueOf(seqS.length())+"M") && divergence == 0) perfectMatch = 1;
                    else perfectMatch = 0;
                }
                TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                long[] seq = BaseEncoder.getLongArrayFromSeq(seqS,topm.getTagSizeInLong()*BaseEncoder.chunkSize);
                int tagIndex = topm.getTagIndex(seq);
                if (tagIndex < this.bufferTagIndexRange[0] || tagIndex >= this.bufferTagIndexRange[1]) {
                    System.out.println("The index of the tag from sam file is out of buffer range. Program quits.");
                    System.out.println("Please increase the buffer number");
                    System.exit(1);
                }
                int bufferIndex = Arrays.binarySearch(bufferStartTagIndex, tagIndex);
                if (bufferIndex < 0) bufferIndex = -bufferIndex-2;
                int bufferTagIndex = tagIndex % topm.getChunkSize();
                int mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                if (mappingDatasetIndex == Integer.MIN_VALUE) continue;
                tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
                if (bufferLights[bufferIndex][bufferTagIndex] == false) {
                    lightCounts[bufferIndex]++;
                }
                bufferLights[bufferIndex][bufferTagIndex] = true;
                if (lightCounts[0] == topm.getChunkSize() && lightCounts[1] > this.updateBufferCountCutoff) {
                    this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
                    this.updateTMIBuffer();
                    System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
                }   
            }
            this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
            topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
            System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
            br.close();
        } catch (Exception e) {
            
            e.printStackTrace();
            System.exit(1);
        }
    }
    
    /**
     * Annotate the TOPM with BWA
     * @param samFileS
     * @param maxMappingNum 
     */
    public void annotateWithBWA (String samFileS, int maxMappingNum) {
        String[] dataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        this.iniTMIBuffers(bufferNum, maxMappingNum);
        System.out.println("Reading SAM format tag alignment (BWA) from: " + samFileS);
        System.out.println("Coverting SAM to TOPMHDF5...");
        byte mappingSource = Aligner.BWA.getValue();
        int chunkCnt = 0;
        try {
            BufferedReader br;
            if (samFileS.endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(new File(samFileS)))));
            } else {
                br = new BufferedReader(new FileReader(new File(samFileS)), 65536);
            }
            String inputStr = null;
            while ((inputStr=br.readLine()).startsWith("@")) {};
            while(inputStr!=null) {
                String[] temp =inputStr.split("\\s");
                int orientiation=Integer.parseInt(temp[1]);
                int chr = Integer.MIN_VALUE;
                byte strand = Byte.MIN_VALUE;
                int startPos = Integer.MIN_VALUE;
                int endPos = Integer.MIN_VALUE;
                short mappingScore = Short.MIN_VALUE;
                byte divergence = Byte.MIN_VALUE;
                byte perfectMatch = Byte.MIN_VALUE;
                String seqS = temp[9];
                String XAString = null;
                if (temp[temp.length-1].startsWith("XA")) {
                    XAString = temp[temp.length-1].replaceFirst("XA:Z:", "");
                }
                if (orientiation == 4) {
                    
                }
                else if (orientiation == 16) {
                    seqS = BaseEncoder.getReverseComplement(seqS);
                    chr = Integer.parseInt(temp[2]);
                    strand = -1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[1];
                    endPos = alignSpan[0];
                    divergence = Byte.parseByte(temp[12].split(":")[2]);
                    if (temp[5].equals(String.valueOf(seqS.length())+"M") && divergence == 0) perfectMatch = 1;
                    else perfectMatch = 0;
                }
                else {
                    chr = Integer.parseInt(temp[2]);
                    strand = 1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[0];
                    endPos = alignSpan[1];
                    divergence = Byte.parseByte(temp[12].split(":")[2]);
                    if (temp[5].equals(String.valueOf(seqS.length())+"M") && divergence == 0) perfectMatch = 1;
                    else perfectMatch = 0;
                }
                TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                long[] seq = BaseEncoder.getLongArrayFromSeq(seqS,topm.getTagSizeInLong()*BaseEncoder.chunkSize);
                int tagIndex = topm.getTagIndex(seq);
                if (tagIndex < this.bufferTagIndexRange[0] || tagIndex >= this.bufferTagIndexRange[1]) {
                    System.out.println("The index of the tag from sam file is out of buffer range. Program quits.");
                    System.out.println("Please increase the buffer number");
                    System.exit(1);
                    }  
                int bufferIndex = Arrays.binarySearch(bufferStartTagIndex, tagIndex);
                if (bufferIndex < 0) bufferIndex = -bufferIndex-2;
                int bufferTagIndex = tagIndex % topm.getChunkSize();
                int mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                if (mappingDatasetIndex == Integer.MIN_VALUE) continue;
                tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
                if (bufferLights[bufferIndex][bufferTagIndex] == false) {
                    lightCounts[bufferIndex]++;
                }
                if (XAString != null) {
                    temp = XAString.split(";");
                    for (int i = 0; i < temp.length; i++) {
                        mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                        if (mappingDatasetIndex == Integer.MIN_VALUE) break;
                        String[] tem = temp[i].split(",");
                        chr = Integer.parseInt(tem[0]);
                        if (tem[1].startsWith("+")) {
                            strand = 1;
                            int[] alignSpan = SAMUtils.adjustCoordinates(tem[2], Integer.parseInt(tem[1].substring(1)));
                            startPos = alignSpan[1];
                            endPos = alignSpan[0];
                        }
                        else {
                            strand = -1;
                            int[] alignSpan = SAMUtils.adjustCoordinates(tem[2], Integer.parseInt(tem[1].substring(1)));
                            startPos = alignSpan[0];
                            endPos = alignSpan[1];
                        }
                        divergence = Byte.parseByte(tem[3]);
                        if (tem[2].equals(String.valueOf(seqS.length())+"M") && divergence == 0) perfectMatch = 1;
                        else perfectMatch = 0;
                        theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                        tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
                    }
                }
                bufferLights[bufferIndex][bufferTagIndex] = true;
                if (lightCounts[0] == topm.getChunkSize() && lightCounts[1] > this.updateBufferCountCutoff) {
                    this.saveBWATMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
                    this.updateTMIBuffer();
                    System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
                }
                inputStr=br.readLine();
            }
            this.saveBWATMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
            topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
            System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
            br.close();
        } catch (Exception e) {
            
            e.printStackTrace();
            System.exit(1);
        }       
    }
    
    /**
     * Annotate the TOPM file using bwa-mem
     * @param samFileS
     * @param maxMappingNum 
     */
    public void annotateWithBWAMEM (String samFileS, int maxMappingNum) {
        String[] dataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        this.iniTMIBuffers(bufferNum, maxMappingNum);
        System.out.println("Reading SAM format tag alignment (BWAMEM) from: " + samFileS);
        System.out.println("Coverting SAM to TOPMHDF5...");
        byte mappingSource = Aligner.BWAMEM.getValue();
        try {
            BufferedReader br;
            if (samFileS.endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(new File(samFileS)))));
            } else {
                br = new BufferedReader(new FileReader(new File(samFileS)), 65536);
            }
            String inputStr = null;
            while ((inputStr=br.readLine()).startsWith("@")) {};
            int mapCnt = 0;
            int tagIndex = 0;
            int chunkCnt = 0;
            int seqLength = Integer.MIN_VALUE;
            while(inputStr!=null) {
                String[] temp =inputStr.split("\\s");
                int orientiation=Integer.parseInt(temp[1]);
                int chr = Integer.MIN_VALUE;
                byte strand = Byte.MIN_VALUE;
                int startPos = Integer.MIN_VALUE;
                int endPos = Integer.MIN_VALUE;
                short mappingScore = Short.MIN_VALUE;
                byte divergence = Byte.MIN_VALUE;
                byte perfectMatch = Byte.MIN_VALUE;
                String seqS = temp[9];
                if (seqS.equals("*")) {
                    mapCnt++;
                }
                else {
                    mapCnt = 1;
                    seqLength = seqS.length();
                }
                if (mapCnt > maxMappingNum) {
                    inputStr=br.readLine();
                    continue;
                }
                if (orientiation == 4) {
                    
                }
                else if (orientiation == 16 || orientiation == 272) {
                    if (!seqS.equals("*")) seqS = BaseEncoder.getReverseComplement(seqS);
                    chr = Integer.parseInt(temp[2]);
                    strand = -1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[1];
                    endPos = alignSpan[0];
                    divergence = Byte.parseByte(temp[11].split(":")[2]);
                    if (temp[5].equals(String.valueOf(seqLength)+"M") && divergence == 0) perfectMatch = 1;
                    else perfectMatch = 0;
                    mappingScore = Byte.parseByte(temp[12].split(":")[2]);
                }
                else if (orientiation == 0 || orientiation == 256) {
                    chr = Integer.parseInt(temp[2]);
                    strand = 1;
                    int[] alignSpan = SAMUtils.adjustCoordinates(temp[5], Integer.parseInt(temp[3]));
                    startPos = alignSpan[0];
                    endPos = alignSpan[1];
                    divergence = Byte.parseByte(temp[11].split(":")[2]);
                    if (temp[5].equals(String.valueOf(seqLength)+"M") && divergence == 0) perfectMatch = 1;
                    perfectMatch = 0;
                    mappingScore = Byte.parseByte(temp[12].split(":")[2]);
                }
                else {
                    //handle flag value of 2048 and 2064, use sequence in SAM can't find the tag in TOPM, since seqs are choped
                    //but seems they are all secondary alignment
                    //2048 and 2064 are very rare 0.04%, ignore for now.
                    //Todo, change tag identification from seq to index
                    inputStr=br.readLine();
                    continue;
                }
                TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                if (!seqS.equals("*")) {
                    long[] seq = BaseEncoder.getLongArrayFromSeq(seqS,topm.getTagSizeInLong()*BaseEncoder.chunkSize);
                    tagIndex = topm.getTagIndex(seq);
                    if (tagIndex < this.bufferTagIndexRange[0] || tagIndex >= this.bufferTagIndexRange[1]) {
                        System.out.println("The index of the tag from sam file is out of buffer range. Program quits.");
                        System.out.println("Please increase the buffer number");
                        System.exit(1);
                    }
                }
                int bufferIndex = Arrays.binarySearch(bufferStartTagIndex, tagIndex);
                if (bufferIndex < 0) bufferIndex = -bufferIndex-2;
                int bufferTagIndex = tagIndex % topm.getChunkSize();
                int mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                if (mappingDatasetIndex == Integer.MIN_VALUE) {
                    inputStr=br.readLine();
                    continue;
                }
                tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
                if (bufferLights[bufferIndex][bufferTagIndex] == false) {
                    lightCounts[bufferIndex]++;
                }
                bufferLights[bufferIndex][bufferTagIndex] = true;
                //since the MT output of bwamem are in the same order the fastq
                if (lightCounts[1] > this.updateBufferCountCutoff) {
                    this.saveBWATMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
                    this.updateTMIBuffer();
                    System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
                }
                inputStr=br.readLine();
            }
            this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
            topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
            System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
            br.close();
        } catch (Exception e) {
            
            e.printStackTrace();
            System.exit(1);
        }       
    }
    
    /**
     * Annotate the TOPM with BLAST from a directory, where slices of blast result are stored
     * @param blastDirS
     * @param maxMappingNum 
     */
    public void annotateWithBlastFromDir (String blastDirS, int maxMappingNum) {
        String[] dataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        this.iniTMIBuffers(2, maxMappingNum);
        System.out.println("Reading BLAST table format tag alignment (BLAST) from: " + blastDirS);
        System.out.println("Coverting BLAST to TOPMHDF5...");
        byte mappingSource = Aligner.Blast.getValue();
        File[] infiles = new File (blastDirS).listFiles();
        Arrays.sort(infiles);
        int chunkCnt = 0;
        try {
            BufferedReader br;
            for (int i = 0; i < infiles.length; i++) {
                System.out.println("Reading BLAST table format tag alignment (BLAST) from: " + infiles[i].getAbsolutePath());
                if (infiles[i].getName().endsWith("gz")) {
                    br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(infiles[i]))));
                }
                else {
                    br = new BufferedReader(new FileReader(infiles[i]), 65536);
                }
                String inputStr = null;
                while((inputStr = br.readLine())!=null) {
                    String[] temp =inputStr.split("\\s+");
                    int chr = Integer.parseInt(temp[1]);
                    byte strand = Byte.MIN_VALUE;
                    int startPos = Integer.parseInt(temp[8]);
                    int endPos = Integer.parseInt(temp[9]);
                    if (startPos < endPos) {
                        strand = 1;
                    }
                    else {
                        strand = -1;
                    }
                    short mappingScore = Short.parseShort(temp[11].replaceAll("\\..+", ""));
                    byte divergence = Byte.valueOf(temp[4]);
                    byte perfectMatch = 0;
                    if (temp[2].startsWith("100")) perfectMatch = 1;
                    int tagIndex = Integer.parseInt(temp[0]);
                    if (tagIndex >= this.bufferStartTagIndex[1]) {
                        int n = (tagIndex - bufferStartTagIndex[1]) /topm.getChunkSize()+1;
                        for (int j = 0; j < n; j++) {
                            this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
                            this.updateTMIBuffer();
                        }  
                    }
                    int bufferIndex = Arrays.binarySearch(bufferStartTagIndex, tagIndex);
                    if (bufferIndex < 0) bufferIndex = -bufferIndex-2;
                    int bufferTagIndex = tagIndex % topm.getChunkSize();
                    int mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                    if (mappingDatasetIndex == Integer.MIN_VALUE) continue;
                    TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                    tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
                }
                br.close();
                System.gc();
            }
            this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
            topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
            System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
    
    /**
     * Annotate the TOPM with BLAST
     * @param blastM8FileS
     * @param maxMappingNum 
     */
    public void annotateWithBLAST (String blastM8FileS, int maxMappingNum) {
        String[] dataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        this.iniTMIBuffers(2, maxMappingNum);
        System.out.println("Reading BLAST table format tag alignment (BLAST) from: " + blastM8FileS);
        System.out.println("Coverting BLAST to TOPMHDF5...");
        byte mappingSource = Aligner.Blast.getValue();
        int chunkCnt = 0;
        try {
            BufferedReader br;
            if (blastM8FileS.endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(new File(blastM8FileS)))));
            } else {
                br = new BufferedReader(new FileReader(new File(blastM8FileS)), 65536);
            }
            String inputStr = null;
            while((inputStr = br.readLine())!=null) {
                String[] temp =inputStr.split("\\s+");
                int chr = Integer.parseInt(temp[1]);
                byte strand = Byte.MIN_VALUE;
                int startPos = Integer.parseInt(temp[8]);
                int endPos = Integer.parseInt(temp[9]);
                if (startPos < endPos) {
                    strand = 1;
                }
                else {
                    strand = -1;
                }
                short mappingScore = Short.parseShort(temp[11].replaceAll("\\..+", ""));
                byte divergence = Byte.valueOf(temp[4]);
                byte perfectMatch = 0;
                if (temp[2].startsWith("100")) perfectMatch = 1;
                int tagIndex = Integer.parseInt(temp[0]);
                if (tagIndex >= this.bufferStartTagIndex[1]) {
                    int n = (tagIndex - bufferStartTagIndex[1]) /topm.getChunkSize()+1;
                    for (int j = 0; j < n; j++) {
                        this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
                        this.updateTMIBuffer();
                    }  
                }
                int bufferIndex = Arrays.binarySearch(bufferStartTagIndex, tagIndex);
                if (bufferIndex < 0) bufferIndex = -bufferIndex-2;
                int bufferTagIndex = tagIndex % topm.getChunkSize();
                int mappingDatasetIndex = this.getMappingDatasetIndex(bufferIndex, bufferTagIndex);
                if (mappingDatasetIndex == Integer.MIN_VALUE) continue;
                TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, endPos, divergence, perfectMatch, mappingSource, mappingScore);
                System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
                tmiBuffers[bufferIndex][mappingDatasetIndex][bufferTagIndex] = theTMI;
            }
            this.saveTMIBufferToTOPM(tmiBuffers[0], dataSetNames, this.bufferStartTagIndex[0]/topm.getChunkSize(), mappingSource);
            topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
            System.out.println(++chunkCnt + " chunks are annotated. " + topm.getChunkNum() + " chunks in total");
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
    
    /**
     * Annotate the TOPM with PE, using bowtie2 sam file
     * @param PETOPMFileS
     * @param maxMappingNum 
     */
    public void annotateWithPE (String PETOPMFileS, int maxMappingNum) {
        byte forwardMappingSource = Aligner.PEEnd1.getValue(), backMappingSource = Aligner.PEEnd2.getValue();
        PETagsOnPhysicalMapV3 ptopm = new PETagsOnPhysicalMapV3(PETOPMFileS);
        String[] forwardDataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
        String[] backwardDataSetNames = topm.creatTagMappingInfoDatasets(topm.getMappingNum(), maxMappingNum);
        topm.setMappingNum(topm.getMappingNum()+maxMappingNum);
        TagMappingInfoV3[][] forwardBuffer;
        TagMappingInfoV3[][] backBuffer;
        for (int i = 0; i < topm.getChunkNum(); i++) {
            forwardBuffer = this.getPopulateTMIBuffer(maxMappingNum);
            backBuffer = this.getPopulateTMIBuffer(maxMappingNum);
            int startIndex = i*topm.getChunkSize();
            int endIndex = startIndex+topm.getChunkSize();
            if (endIndex > topm.getTagCount()) endIndex = topm.getTagCount();
            for (int j = startIndex; j < endIndex; j++) {
                long[] t = topm.getTag(j);
                int index = ptopm.getTagIndexWithLongestSeq(t);
                if (index == -1) continue;
                int max = ptopm.getMappingNum(index);
                if (max > maxMappingNum) max = maxMappingNum;
                for (int k = 0; k < max; k++) {
                    int chr = ptopm.getChr(index, k);
                    byte strand = ptopm.getStrand(index, k);
                    int startPos = ptopm.getStartPos(index, k);
                    short mappingScore = ptopm.getScore(index, k);
                    byte divergence = ptopm.getDivergence(index, k);
                    TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, Integer.MIN_VALUE, divergence, Byte.MIN_VALUE, forwardMappingSource, mappingScore);
                    forwardBuffer[k][j-startIndex] = theTMI;
                }
                max = ptopm.getMappingNum(ptopm.getPairIndex(index));
                if (max > maxMappingNum) max = maxMappingNum;
                for (int k = 0; k < max; k++) {
                    int chr = ptopm.getChr(ptopm.getPairIndex(index), k);
                    byte strand = ptopm.getStrand(ptopm.getPairIndex(index), k);
                    int startPos = ptopm.getStartPos(ptopm.getPairIndex(index), k);
                    short mappingScore = ptopm.getScore(ptopm.getPairIndex(index), k);
                    byte divergence = ptopm.getDivergence(ptopm.getPairIndex(index), k);
                    TagMappingInfoV3 theTMI = new TagMappingInfoV3(chr, strand, startPos, Integer.MIN_VALUE, divergence, Byte.MIN_VALUE, backMappingSource, mappingScore);
                    backBuffer[k][j-startIndex] = theTMI;
                } 
            }
            this.saveTMIBufferToTOPM(forwardBuffer, forwardDataSetNames, i);
            this.saveTMIBufferToTOPM(backBuffer, backwardDataSetNames, i);
            System.out.println("Chunk " + i + "(index) with " + topm.getChunkSize() + " tags is annotated");
        }
    }
    
    /**
     * Annotate the TOPM with whole genome genetic mapping
     * @param TOGMFileS
     * @param maxMappingNum 
     */
    public void annotateWithGMGW (String TOGMFileS, int maxMappingNum) {
        TagsOnGeneticMap togm = new TagsOnGeneticMap(TOGMFileS, FilePacking.Text);
        TagGeneticMappingInfo[] gmChunk;
        String dataSetName = topm.creatTagGeneticMappingInfoGWDataset();
        for (int i = 0; i < topm.getChunkNum(); i++) {
            gmChunk = new TagGeneticMappingInfo[topm.getChunkSize()];
            for (int j = 0; j < topm.getChunkSize(); j++) {
                gmChunk[j] = new TagGeneticMappingInfo();
            }
            int startIndex = i*topm.getChunkSize();
            int endIndex = startIndex+topm.getChunkSize();
            if (endIndex > topm.getTagCount()) endIndex = topm.getTagCount();
            for (int j = startIndex; j < endIndex; j++) {
                long[] t = topm.getTag(j);
                int index = togm.getTagIndex(t);
                if (index < 0) continue;
                int chr = togm.getGChr(index);
                int position = togm.getGPos(index); //rough pos in GM
                TagGeneticMappingInfo tgmi = new TagGeneticMappingInfo(Double.NEGATIVE_INFINITY, chr, position, Integer.MIN_VALUE, Integer.MIN_VALUE);
                gmChunk[j-startIndex] = tgmi;     
            }
            topm.writeTagGeneticMappingInfoGWDataSet(dataSetName, gmChunk, i);
            if (i%100 == 0) System.out.println("Chunk " + i + "(index) with " + topm.getChunkSize() + " tags is annotated with genome wide genetic mapping");
        }
    }
    
    /**
     * Save mapping info (from BWA) in a buffer to TOPM
     * BWA doesn't provide mapping score, so the rank is just based on the order provided for multiple hits
     * @param tmiBuffer
     * @param dataSetNames
     * @param chunkIndex
     * @param mappingSource 
     */
    private void saveBWATMIBufferToTOPM (TagMappingInfoV3[][] tmiBuffer, String[] dataSetNames, int chunkIndex, byte mappingSource) {
        for (int i = 0; i < tmiBuffer.length; i++) {
            for (int j = 0; j < tmiBuffer[i].length; j++) {
                if (tmiBuffer[i][j] != null) continue;
                tmiBuffer[i][j] = new TagMappingInfoV3();
                tmiBuffer[i][j].setMappingSource(mappingSource);
            }
        }
        for (int i = 0; i < tmiBuffer.length; i++) {
            for (int j = 0; j < tmiBuffer[i].length; j++) {
                tmiBuffer[i][j].setMappingRank((byte)i);
            }
        }
        topm.writeTagMappingInfoDataSets(dataSetNames, tmiBuffer, chunkIndex);
    }
    
    
    /**
     * Save mapping info in a buffer to TOPM. This works for PE and Genetic mapping. When mappingSource == Byte.MIN_VALUE, it means the tag doesn't exist in PE list or Genetic mapping
     * @param tmiBuffer
     * @param dataSetNames
     * @param chunkIndex
     */
    private void saveTMIBufferToTOPM (TagMappingInfoV3[][] tmiBuffer, String[] dataSetNames, int chunkIndex) {
        for (int i = 0; i < tmiBuffer[0].length; i++) {
            int sum = 0;
            for (int j = 0; j < tmiBuffer.length; j++) {
                if (tmiBuffer[j][i].mappingSource == Byte.MIN_VALUE) sum++;
            }
            if (sum == tmiBuffer.length) continue;
            TreeSet set = new TreeSet();
            for (int j = 0; j < tmiBuffer.length; j++) {
                set.add(tmiBuffer[j][i].mappingScore);
            }
            Short[] sA = set.toArray(new Short[set.size()]);
            byte[] rank = new byte[tmiBuffer.length];
            for (int j = 0; j < rank.length; j++) {
                rank[j] = (byte)(sA.length - Arrays.binarySearch(sA, tmiBuffer[j][i].mappingScore) -1);
                tmiBuffer[j][i].setMappingRank(rank[j]);
            }
        }
        topm.writeTagMappingInfoDataSets(dataSetNames, tmiBuffer, chunkIndex);
    }
    
    /**
     * Save mapping info in a buffer to TOPM. This works for bowtie2, blast and bwamem
     * @param tmiBuffer
     * @param dataSetNames
     * @param chunkIndex
     * @param mappingSource 
     */
    private void saveTMIBufferToTOPM (TagMappingInfoV3[][] tmiBuffer, String[] dataSetNames, int chunkIndex, byte mappingSource) {
        for (int i = 0; i < tmiBuffer.length; i++) {
            for (int j = 0; j < tmiBuffer[i].length; j++) {
                if (tmiBuffer[i][j] != null) continue;
                tmiBuffer[i][j] = new TagMappingInfoV3();
                tmiBuffer[i][j].setMappingSource(mappingSource);
            }
        }
        for (int i = 0; i < tmiBuffer[0].length; i++) {
            TreeSet set = new TreeSet();
            for (int j = 0; j < tmiBuffer.length; j++) {
                set.add(tmiBuffer[j][i].mappingScore);
            }
            Short[] sA = set.toArray(new Short[set.size()]);
            byte[] rank = new byte[tmiBuffer.length];
            for (int j = 0; j < rank.length; j++) {
                rank[j] = (byte)(sA.length - Arrays.binarySearch(sA, tmiBuffer[j][i].mappingScore) -1);
                tmiBuffer[j][i].setMappingRank(rank[j]);
            }
        }
        topm.writeTagMappingInfoDataSets(dataSetNames, tmiBuffer, chunkIndex);
    }
    
    /**
     * Update TMI buffers. After the buffers[0] is written to TOPM, buffers[i] moves to buffers[i-1]. This creats a new buffer at the end
     */
    private void updateTMIBuffer () {
        for (int i = 0; i < tmiBuffers.length-1; i++) {
            tmiBuffers[i] = tmiBuffers[i+1];
            bufferStartTagIndex[i] = bufferStartTagIndex[i+1];
            bufferLights[i] = bufferLights[i+1];
            lightCounts[i] = lightCounts[i+1];
        }
        tmiBuffers[tmiBuffers.length-1] = new TagMappingInfoV3[tmiBuffers[0].length][topm.getChunkSize()];
        bufferStartTagIndex[tmiBuffers.length-1]+=topm.getChunkSize();
        bufferLights[tmiBuffers.length-1] = new boolean[topm.getChunkSize()];
        lightCounts[tmiBuffers.length-1] = 0;
        this.calBufferTagIndexRange();
    }
    
    private TagMappingInfoV3[][] getPopulateTMIBuffer (int maxMappingNum) {
        TagMappingInfoV3[][] tmiBuffer = new TagMappingInfoV3[maxMappingNum][topm.getChunkSize()] ;
        for (int i = 0; i < tmiBuffer.length; i++) {
            for (int j = 0; j < tmiBuffer[i].length; j++) {
                tmiBuffer[i][j] = new TagMappingInfoV3();
            }
        }
        return tmiBuffer;
    }
    
    /**
     * Initialize TagMappingInfo buffers
     * @param bufferNum number of buffers
     * @param maxMappingNum maximum mapping information which will be held
     */
    private void iniTMIBuffers (int bufferNum, int maxMappingNum) {
        tmiBuffers = new TagMappingInfoV3[bufferNum][maxMappingNum][topm.getChunkSize()];
        bufferStartTagIndex = new int[bufferNum];
        for (int i = 0; i < bufferNum; i++) {
            bufferStartTagIndex[i] = i*topm.getChunkSize();
        }
        this.calBufferTagIndexRange();
        this.bufferLights = new boolean[bufferNum][topm.getChunkSize()];
        this.lightCounts = new int[bufferNum];
        updateBufferCountCutoff = (int)((double)topm.getChunkSize() * 0.2);
    }
    
    /**
     * Calculate the tag index range in these buffers
     */
    private void calBufferTagIndexRange () {
        this.bufferTagIndexRange = new int[2];
        bufferTagIndexRange[0] = this.bufferStartTagIndex[0];
        bufferTagIndexRange[1] = this.bufferStartTagIndex[tmiBuffers.length-1]+topm.getChunkSize();
    }
    
    
    private int getMappingDatasetIndex (int bufferIndex, int bufferTagIndex) {
        for (int i = 0; i < tmiBuffers[0].length; i++) {
            if (tmiBuffers[bufferIndex][i][bufferTagIndex] == null) {
                return i;
            }
        }
        return Integer.MIN_VALUE;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy