All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.ParseBarcodeRead Maven / Gradle / Ivy

package net.maizegenetics.analysis.gbs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;

import net.maizegenetics.dna.BaseEncoder;

/**
 * Takes a key file and then sets up the methods to decode a read from the sequencer.
 * The key file decribes how barcodes are related to their taxon.  Generally, a keyfile
 * with all flowcells is included, and then the flowcell and lane to be processed are
 * indicated in the constructor.
 *
 * @author Ed Buckler, Jeff Glaubitz, and James Harriman
 *
 */
public class ParseBarcodeRead {

    private static int chunkSize = BaseEncoder.chunkSize;
    protected int maximumMismatchInBarcodeAndOverhang = 0;
    protected static String[] initialCutSiteRemnant = null;
    protected static int readEndCutSiteRemnantLength;
    static String nullS = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    protected static String[] likelyReadEnd = null;
    protected static String theEnzyme = null;
    static int maxBarcodeLength = 10;
    private Barcode[] theBarcodes;
    private long[] quickBarcodeList;
    private HashMap quickMap;

    /**
     * Create the barcode parsing object
     * @param keyFile file location for the keyfile
     * @param enzyme name of the enzyme
     * @param flowcell name of the flowcell to be processed
     * @param lane name of the lane to be processed
     */
    public ParseBarcodeRead(String keyFile, String enzyme, String flowcell, String lane) {
        if (enzyme != null) {
            chooseEnzyme(enzyme);
        } else {
            chooseEnzyme(getKeyFileEnzyme(keyFile));
        }

        int totalBarcodes = setupBarcodeFiles(new File(keyFile), flowcell, lane);
        System.out.println("Total barcodes found in lane:" + totalBarcodes);
    }

    /**
     * Determines which cut sites to look for, and sets them, based on the
     * enzyme used to generate the GBS library. For two-enzyme GBS both enzymes
     * MUST be specified and separated by a dash "-". e.g. PstI-MspI, SbfI-MspI
     *
     * @param enzyme The name of the enzyme (case insensitive)
     */
    //TODO these should all be private static final globals, then just use this set which one is active.
    public static void chooseEnzyme(String enzyme) {
        // Check for case-insensitive (?i) match to a known enzyme
        // The common adapter is: [readEndCutSiteRemnant]AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG  
        if (enzyme.matches("(?i)apek[i1]")) {
            theEnzyme = "ApeKI";
            initialCutSiteRemnant = new String[]{"CAGC", "CTGC"};
            likelyReadEnd = new String[]{"GCAGC", "GCTGC", "GCAGAGAT", "GCTGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)pst[i1]")) {
            theEnzyme = "PstI";
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"CTGCAG", "CTGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)ecot22[i1]")) {
            theEnzyme = "EcoT22I";
            initialCutSiteRemnant = new String[]{"TGCAT"};
            likelyReadEnd = new String[]{"ATGCAT", "ATGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)pas[i1]")) {
            theEnzyme = "PasI";
            initialCutSiteRemnant = new String[]{"CAGGG", "CTGGG"};
            likelyReadEnd = new String[]{"CCCAGGG", "CCCTGGG", "CCCTGAGAT", "CCCAGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)hpaii|(?i)hpa2")) {
            theEnzyme = "HpaII";
            initialCutSiteRemnant = new String[]{"CGG"};
            likelyReadEnd = new String[]{"CCGG", "CCGAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)msp[i1]")) {
            theEnzyme = "MspI";  // MspI and HpaII are isoschizomers (same recognition seq and overhang)
            initialCutSiteRemnant = new String[]{"CGG"};
            likelyReadEnd = new String[]{"CCGG", "CCGAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)pst[i1]-apek[i1]")) {
            theEnzyme = "PstI-ApeKI";
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"GCAGC", "GCTGC", "CTGCAG", "GCAGAGAT", "GCTGAGAT"}; // look for ApeKI site, PstI site, or common adapter for ApeKI
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)pst[i1]-bfa[i1]")) {
            theEnzyme = "PstI-BfaI";  //  PstI: CTGCA^G  BfaI: C^TAG
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"CTGCAG", "CTAG", "CTAAGATC"}; // look for PstI site, BfaI site, or common adapter start for BfaI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)pst[i1]-ecot22[i1]")) {
            theEnzyme = "PstI-EcoT22I";
            initialCutSiteRemnant = new String[]{"TGCAG", "TGCAT"};
            likelyReadEnd = new String[]{"ATGCAT", "CTGCAG", "CTGCAAGAT", "ATGCAAGAT"}; // look for EcoT22I site, PstI site, or common adapter for PstI/EcoT22I
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)pst[i1]-msp[i1]")) {
            theEnzyme = "PstI-MspI";
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"CCGG", "CTGCAG", "CCGAGATC"}; // look for MspI site, PstI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)pst[i1]-msp[i1]-GDFcustom")) {
            theEnzyme = "PstI-MspI-GDFcustom";
            initialCutSiteRemnant = new String[]{"TGCAG"};
            // changed from  CCGAGAT to CCGCTCAGG, as IGD/GDF used a custom Y adapter for MspI
            likelyReadEnd = new String[]{"CCGG", "CTGCAG", "CCGCTCAGG"}; // look for MspI site, PstI site, or GDF custom common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)pst[i1]-taq[i1]")) {
            theEnzyme = "PstI-TaqI";
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"TCGA", "CTGCAG", "TCGAGATC"}; // look for TaqI site, PstI site, or common adapter for TaqI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)nsi[i1]-msp[i1]")) {
            theEnzyme = "NsiI-MspI";  //  ATGCA^T   C^CGG
            initialCutSiteRemnant = new String[]{"TGCAT"};
            likelyReadEnd = new String[]{"CCGG", "ATGCAT", "CCGAGATC"}; // look for MspI site, NsiI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)PaeR7[i1]-Hha[i1]")) {
            theEnzyme = "PaeR7I-HhaI";
            // Requested by Ye, Songqing, use same Y adapter as Polland paper  -QS
            initialCutSiteRemnant=new String[]{"TCGAG"};
            likelyReadEnd = new String[]{"GCGC", "CTCGAG", "GCGAGATC"}; // look for HhaI site, PaeR7I site, or common adapter for HhaI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)sbf[i1]-msp[i1]")) {
            theEnzyme = "SbfI-MspI";  // CCTGCA^GG  C^CGG
            initialCutSiteRemnant = new String[]{"TGCAGG"};
            likelyReadEnd = new String[]{"CCGG", "CCTGCAGG", "CCGAGATC"}; // look for MspI site, SbfI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)sbf[i1]-hpaii|(?i)sbf[i1]-hpa2")) {
            theEnzyme = "SbfI-HpaII";  // CCTGCA^GG  C^CGG  Nb: HpaII is an isoschizomer of MspI
            initialCutSiteRemnant = new String[]{"TGCAGG"};
            likelyReadEnd = new String[]{"CCGG", "CCTGCAGG", "CCGAGATC"}; // look for HpaII site, SbfI site, or common adapter for HpaII
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)sbf[i1]-bfa[i1]")) {
            theEnzyme = "SbfI-BfaI";  // CCTGCA^GG  C^TAG
            initialCutSiteRemnant = new String[]{"TGCAGG"};
            likelyReadEnd = new String[]{"CTAG", "CCTGCAGG", "CTAAGATC"}; // look for BfaI site, SbfI site, or common adapter for BfaI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)sph[i1]-ecor[i1]")) {
            theEnzyme = "SphI-EcoRI";  // GCATG^C  G^AATTC
            initialCutSiteRemnant = new String[]{"CATGC"};  // SphI overhang from the genomic DNA fragment (top strand)
            likelyReadEnd = new String[]{"GCATGC", "GAATTC", "GAATTAGATC"}; // look for SphI site, EcoRI site, or common adapter for EcoRI
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)asis[i1]-msp[i1]")) {
            theEnzyme = "AsiSI-MspI";
            initialCutSiteRemnant = new String[]{"ATCGC"};
            likelyReadEnd = new String[]{"CCGG", "GCGATCGC", "CCGAGATC"}; // look for MspI site, AsiSI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)bsshii-msp[i1]|(?i)bssh2-msp[i1]")) {
            theEnzyme = "BssHII-MspI";
            initialCutSiteRemnant = new String[]{"CGCGC"};
            likelyReadEnd = new String[]{"CCGG", "GCGCGC", "CCGAGATC"}; // look for MspI site, BssHII site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)fse[i1]-msp[i1]")) {
            theEnzyme = "FseI-MspI";
            initialCutSiteRemnant = new String[]{"CCGGCC"};
            likelyReadEnd = new String[]{"CCGG", "GGCCGGCC", "CCGAGATC"}; // look for MspI site, FseI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)sal[i1]-msp[i1]")) {
            theEnzyme = "SalI-MspI";
            initialCutSiteRemnant = new String[]{"TCGAC"};
            likelyReadEnd = new String[]{"CCGG", "GTCGAC", "CCGAGATC"}; // look for MspI site, SalI site, or common adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)ecor[i1]-msp[i1]")) {
            theEnzyme = "EcoRI-MspI";   //  G^AATTC  C^CGG
            initialCutSiteRemnant = new String[]{"AATTC"};
            likelyReadEnd = new String[]{"CCGG", "GAATTC", "CCGAGATC"}; // look for MspI site, EcoRI site, or Poland et al. 2012 Y-adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)hindiii-msp[i1]|(?i)hind3-msp[i1]")) {
            theEnzyme = "HindIII-MspI"; // A^AGCTT   C^CGG
            initialCutSiteRemnant = new String[]{"AGCTT"};
            likelyReadEnd = new String[]{"CCGG", "AAGCTT", "CCGAGATC"}; // look for MspI site, HindIII site, or Poland et al. 2012 Y-adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)hindiii-nlaiii|(?i)hind3-nla3")) {
            theEnzyme = "HindIII-NlaIII"; // A^AGCTT   ^CATG (not blunt)
            initialCutSiteRemnant = new String[]{"AGCTT"};
            likelyReadEnd = new String[]{"CATG", "AAGCTT", "CATGAGATC"}; // look for NlaIII site, HindIII site, or Poland et al. 2012 Y-adapter for Nla3
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)sexa[i1]-sau3a[i1]")) {
            theEnzyme = "SexAI-Sau3AI";  // A^CCWGGT   ^GATC (not blunt)
            initialCutSiteRemnant = new String[]{"CCAGGT", "CCTGGT"};
            likelyReadEnd = new String[]{"GATC", "ACCAGGT", "ACCTGGT", "GATCAGATC"}; // look for SexAI site, Sau3AI site, or Poland et al. 2012 Y-adapter for Sau3AI
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)bamh[i1l]-mluc[i1]")) {
            theEnzyme = "BamHI-MluCI";  // G^GATCC   ^AATT (not blunt)
            initialCutSiteRemnant = new String[]{"GATCC"};
            likelyReadEnd = new String[]{"AATT", "GGATCC", "AATTAGATC"}; // look for MluCI site, BamHI site, or Poland et al. 2012 Y-adapter for MluCI
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)pst[i1]-mluc[i1]")) {
            theEnzyme = "PstI-MluCI";  // CTGCA^G   ^AATT (not blunt)
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"AATT", "CTGCAG", "AATTAGATC"}; // look for MluCI site, PstI site, or Poland et al. 2012 Y-adapter for MluCI
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)psti-msei|(?i)pst1-mse1")) {
            theEnzyme = "PstI-MseI"; // CTGCA^G   T^TAA
            initialCutSiteRemnant = new String[]{"TGCAG"};
            likelyReadEnd = new String[]{"TTAA", "CTGCAG", "TTAAGATC"}; // look for MseI site, PstI site, or Poland et al. 2012 Y-adapter for MseI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)avaii-msei|(?i)ava2-mse1")) {
            theEnzyme = "AvaII-MseI"; // G^GWCC   T^TAA  W=AorT
            initialCutSiteRemnant = new String[]{"GACC", "GTCC"};
            likelyReadEnd = new String[]{"TTAA", "GGACC", "GGTCC", "TTAAGATC"}; // look for MseI site, AvaII site, or Poland et al. 2012 Y-adapter for MseI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)ecori-msei|(?i)ecor1-mse1")) {
            theEnzyme = "EcoRI-MseI"; // G^AATTC   T^TAA
            initialCutSiteRemnant = new String[]{"AATTC"};
            likelyReadEnd = new String[]{"TTAA", "GAATTC", "TTAAGATC"}; // look for MseI site, EcoRI site, or Poland et al. 2012 Y-adapter for MseI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)ecori-avaii|(?i)ecor1-ava2")) {
            theEnzyme = "EcoRI-AvaII"; // G^AATTC   G^GWCC
            initialCutSiteRemnant = new String[]{"AATTC"};
            likelyReadEnd = new String[]{"GGACC", "GGTCC", "GAATTC", "GGACAGATC", "GGTCAGATC"}; // look for AvaII site, EcoRI site, or Poland et al. 2012 Y-adapter for AvaII
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)ecori-hinfi|(?i)ecor1-hinf1")) {
            theEnzyme = "EcoRI-HinfI"; // G^AATTC   G^ANTC
            initialCutSiteRemnant = new String[]{"AATTC"};
            likelyReadEnd = new String[]{"GAATC", "GACTC", "GAGTC", "GATTC", "GAATTC", "GAATAGATC", "GACTAGATC", "GAGTAGATC", "GATTAGATC"}; // look for HinfI site, EcoRI site, or Poland et al. 2012 Y-adapter for HinfI
            readEndCutSiteRemnantLength = 4;
        } else if (enzyme.matches("(?i)bbvci-mspi|(?i)bbvc1-msp1")) {
            theEnzyme = "BbvCI-MspI"; // CCTCAGC (-5/-2)   C^CGG
            initialCutSiteRemnant = new String[]{"TCAGC"};
            likelyReadEnd = new String[]{"CCGG", "CCTCAGC", "CCGAGATC"}; // look for MspI site, BbvCI site, or Poland et al. 2012 Y-adapter for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)msp[i1]-apek[i1]")) {
            theEnzyme = "MspI-ApeKI";  // C^CGG  G^CWGC
            initialCutSiteRemnant = new String[]{"CGG", "CAGC", "CTGC"};
            likelyReadEnd = new String[]{"CCGG", "GCAGC", "GCTGC", "CCGAGATCGG", "GCAGAGAT", "GCTGAGAT"}; 
            readEndCutSiteRemnantLength = 3; 
        } else if(enzyme.matches("(?i)apo[i1]")){
            theEnzyme = "ApoI";
            initialCutSiteRemnant=new String[]{"AATTC","AATTT"};
            likelyReadEnd = new String[]{"AAATTC","AAATTT","GAATTC","GAATTT","AAATTAGAT","GAATTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)BamH[i1l]")) {
            theEnzyme = "BamHI";
            initialCutSiteRemnant = new String[]{"GATCC"};
            likelyReadEnd = new String[]{"GGATCC", "GGATCAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            // likelyReadEnd = new String[]{"GGATCC", "AGATCGGAA", "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"}; // <-- corrected from this by Jeff Glaubitz on 2012/09/12
            readEndCutSiteRemnantLength = 5;
        } else if (enzyme.matches("(?i)mse[i1]")) {
            theEnzyme = "MseI";
            initialCutSiteRemnant = new String[]{"TAA"};
            likelyReadEnd = new String[]{"TTAA", "TTAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)Sau3A[i1]")){
            theEnzyme = "Sau3AI";
            initialCutSiteRemnant=new String[]{"GATC"};
            likelyReadEnd = new String[]{"GATC","GATCAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;
        } else if(enzyme.matches("(?i)nde[i1]")){
            theEnzyme = "NdeI";
            initialCutSiteRemnant=new String[]{"TATG"};
            likelyReadEnd = new String[]{"CATATG","CATAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;
        } else if(enzyme.matches("(?i)hinp1[i1]")){
            theEnzyme = "HinP1I";
            initialCutSiteRemnant=new String[]{"CGC"};
            likelyReadEnd = new String[]{"GCGC","GCGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)sbf[i1]")){
            theEnzyme = "SbfI";
            initialCutSiteRemnant=new String[]{"TGCAGG"};
            likelyReadEnd = new String[]{"CCTGCAGG","CCTGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 6;
        } else if (enzyme.matches("(?i)hindiii|(?i)hind3")) {
            theEnzyme = "HindIII"; // A^AGCTT
            initialCutSiteRemnant=new String[]{"AGCTT"};
            likelyReadEnd = new String[]{"AAGCTT","AAGCTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)ecor[i1]")) {
            theEnzyme = "EcoRI";  // G^AATTC
            initialCutSiteRemnant= new String[]{"AATTC"};
            likelyReadEnd = new String[]{"GAATTC","GAATTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)cviq[i1]")){  
            theEnzyme = "CviQI";  // CviQI and Csp6I are isoschizomers (same recognition seq and overhang)
            initialCutSiteRemnant=new String[]{"TAC"};
            likelyReadEnd = new String[]{"GTAC","GTAAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)csp6[i1]")){  
            theEnzyme = "Csp6I";  // Csp6I and CviQI are isoschizomers (same recognition seq and overhang)
            initialCutSiteRemnant=new String[]{"TAC"};
            likelyReadEnd = new String[]{"GTAC","GTAAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)nlaiii|(?i)nla3")) {
            theEnzyme = "NlaIII"; // CATG^
            initialCutSiteRemnant=new String[]{"CATG"};
            likelyReadEnd = new String[]{"CATG","CATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;
        } else if(enzyme.matches("(?i)sph[i1]")){
            theEnzyme = "SphI";  // GCATG^C
            initialCutSiteRemnant=new String[]{"CATGC"};
            likelyReadEnd = new String[]{"GCATGC","GCATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)nsp[i1]")){
            theEnzyme = "NspI";  // RCATG^Y
            initialCutSiteRemnant=new String[]{"CATGC","CATGT"};
            likelyReadEnd = new String[]{"ACATGT","GCATGC","ACATGAGAT","GCATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)kpn[i1]")){
            theEnzyme = "KpnI";  // GGTAC^C
            initialCutSiteRemnant=new String[]{"GTACC"};
            likelyReadEnd = new String[]{"GGTACC","GGTACAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)sty[i1]")){
            theEnzyme = "StyI";  // C^CWWGG
            initialCutSiteRemnant=new String[]{"CAAGG","CATGG","CTAGG","CTTGG"};
            likelyReadEnd = new String[]{"CCAAGG","CCATGG","CCTAGG","CCTTGG","CCAAGAGAT","CCATGAGAT","CCTAGAGAT","CCTTGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 5;
        } else if(enzyme.matches("(?i)styi-msei|(?i)sty1-mse1")){
            theEnzyme = "StyI-MseI";  // C^CWWGG & T^TAA
            initialCutSiteRemnant=new String[]{"CAAGG","CATGG","CTAGG","CTTGG"};
            likelyReadEnd = new String[]{"TTAA","CCAAGG","CCATGG","CCTAGG","CCTTGG","TTAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)fse[i1]")){
            theEnzyme = "FseI";  // GGCCGG^CC
            initialCutSiteRemnant=new String[]{"CCGGCC"};
            likelyReadEnd = new String[]{"GGCCGGCC","AGATCGGAAG"}; // full cut site (from partial digest or chimera) or Morishige et al (BMC Genomics, 2013) T adapter start
            readEndCutSiteRemnantLength = 0;  // assumes that common T adapter is far more likely than a second full cut site
        } else if(enzyme.matches("(?i)NgoMIV|(?i)NgoM4")){
            theEnzyme = "NgoMIV";  // G^CCGGC
            initialCutSiteRemnant=new String[]{"CCGGC"};
            likelyReadEnd = new String[]{"GCCGGC","AGATCGGAAG"}; // full cut site (from partial digest or chimera) or Morishige et al (BMC Genomics, 2013) T adapter start
            readEndCutSiteRemnantLength = 0;  // assumes that common T adapter is far more likely than a second full cut site
        } else if(enzyme.matches("(?i)msl[i1]")){
            theEnzyme = "MslI";  // CAYNN^NNRTG  -- has 32 different cut sites (assuming constrained to palindromic YNN -- 32^2 otherwise)
            initialCutSiteRemnant=new String[]{""};
            likelyReadEnd = new String[]{"AGATCGGA"}; // common adapter start only (too many possible cut sites!)
            readEndCutSiteRemnantLength = 0;  
        } else if(enzyme.matches("(?i)ase[i1]")){
            theEnzyme = "AseI";  // AT^TAAT
            initialCutSiteRemnant=new String[]{"TAAT"};
            likelyReadEnd = new String[]{"ATTAAT","ATTAAGAT"};  // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;  
        } else if(enzyme.matches("(?i)avaii|(?i)ava2")){
            theEnzyme = "AvaII";  // G^GWCC
            initialCutSiteRemnant=new String[]{"GACC","GTCC"};
            likelyReadEnd = new String[]{"GGACC","GGTCC","GGACAGAT","GGTCAGAT"};  // full cut site (from partial digest or chimera) or common adapter start
            readEndCutSiteRemnantLength = 4;  
        } else if (enzyme.matches("(?i)kpn[i1]-msp[i1]")) {
            theEnzyme = "KpnI-MspI";  //  KpnI: GGTAC^C  MspI: C^CGG
            initialCutSiteRemnant = new String[]{"GTACC"};
            likelyReadEnd = new String[]{"CCGG", "GGTACC", "CCGAGATC"}; // look for MspI site, KpnI site, or common adapter start for MspI
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)RBSTA")) {
            theEnzyme = "RBSTA";
            initialCutSiteRemnant = new String[]{"TA"};
            likelyReadEnd = new String[]{"TTAA", "GTAC", "CTAG", "TTAAGAT", "GTAAGAT", "CTAAGAT"}; // full cut site (from partial digest or chimera) of MseI, CVIQi, XspI or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if (enzyme.matches("(?i)RBSCG")) {
            theEnzyme = "RBSCG";
            initialCutSiteRemnant = new String[]{"CG"};
            likelyReadEnd = new String[]{"CCGC", "TCGA", "GCGC", "CCGG", "ACGT", "CCGAGAT", "TCGAGAT", "GCGAGAT", "ACGAGAT"}; // full cut site (from partial digest or chimera) of AciI, TaqaI, HinpI, HpaII, HpyCH4IV or common adapter start
            readEndCutSiteRemnantLength = 3;
        } else if(enzyme.matches("(?i)ignore")){
            theEnzyme = "unspecified";  // can be used for new enzymes -- only looks for barcodes and common adapter starts
            initialCutSiteRemnant=new String[]{""};
            likelyReadEnd = new String[]{"AGATCGGA"}; // common adapter start only
            readEndCutSiteRemnantLength = 0;  
        } else {
            System.out.println("The software didn't recognize your restriction enzyme (-e option).\n"
                    +"Currently, only the following enzymes are recognized for single enzyme digests:\n"
                    +"  ApeKI"    +"\n"
                    +"  ApoI"     +"\n"
                    +"  AseI"     +"\n"
                    +"  AvaII"    +"\n"
                    +"  BamHI"    +"\n"
                    +"  Csp6I"    +"\n"
                    +"  CviQI"    +"\n"
                    +"  EcoRI"    +"\n"
                    +"  EcoT22I"  +"\n"
                    +"  FseI"     +"\n"
                    +"  HindIII"  +"\n"
                    +"  HinP1I"   +"\n"
                    +"  HpaII"    +"\n"
                    +"  KpnI"     +"\n"
                    +"  MseI"     +"\n"
                    +"  MslI"     +"\n"
                    +"  MspI"     +"\n"
                    +"  NdeI"     +"\n"
                    +"  NgoMIV"   +"\n"
                    +"  NlaIII"   +"\n"
                    +"  NspI"     +"\n"
                    +"  PasI"     +"\n"
                    +"  PstI"     +"\n"
                    +"  Sau3AI"   +"\n"
                    +"  SbfI"     +"\n"
                    +"  SphI"     +"\n"
                    +"  StyI"     +"\n"
                    +"  RBSTA"    +"\n"
                    +"  RBSCG"    +"\n"
                    +"  ignore"    +"\n"
                    +"Or the following for two-enzyme digests:\n"
                    +"  AsiSI-MspI"   +"\n"
                    +"  AvaII-MseI"   +"\n"
                    +"  BamHI-MluCI"  +"\n"
                    +"  BbvCI-MspI"   +"\n"
                    +"  BssHII-MspI"  +"\n"
                    +"  EcoRI-AvaII"  +"\n"
                    +"  EcoRI-HinfI"  +"\n"
                    +"  EcoRI-MseI"   +"\n"
                    +"  EcoRI-MspI"   +"\n"
                    +"  FseI-MspI"    +"\n"
                    +"  HindIII-MspI" +"\n"
                    +"  HindIII-NlaIII" +"\n"
                    +"  KpnI-MspI"    +"\n"
                    +"  MspI-ApeKI"   +"\n"
                    +"  NsiI-MspI"    +"\n"
                    +"  PaeR7I-HhaI"  +"\n"
                    +"  PstI-ApeKI"   +"\n"
                    +"  PstI-BfaI"    +"\n"
                    +"  PstI-EcoT22I" +"\n"
                    +"  PstI-MluCI"   +"\n"
                    +"  PstI-MseI"    +"\n"
                    +"  PstI-MspI"    +"\n"
                    +"  PstI-MspI-GDFcustom"+"\n"
                    +"  PstI-TaqI"    +"\n"
                    +"  SbfI-BfaI"    +"\n"
                    +"  SbfI-HpaII"   +"\n"
                    +"  SalI-MspI"    +"\n"
                    +"  SbfI-MspI"    +"\n"
                    +"  SexAI-Sau3AI" +"\n"
                    +"  SphI-EcoRI"   +"\n"
                    +"  StyI-MseI"    +"\n"
                    +"  ignore"    +"\n"
            );
            System.out.println("For two-enzyme digest, enzyme names should be separated by a dash, e.g. -e PstI-MspI");
            System.out.println("\nIf your enzyme is not on the above list you can use \"-e ignore\". In this case\n"
                    +"   barcodes and common adapter start sequences will be recognized, but chimeric DNA\n"
                    +"   fragments (or partial digests) will not be trimmed.");
        }
        System.out.println("Enzyme: " + theEnzyme);
    }

    private String getKeyFileEnzyme(String keyFileName) {
        String result = null;
        try {
            BufferedReader br = new BufferedReader(new FileReader(keyFileName), 65536);

            String temp;
            int currLine = 0;
            while (((temp = br.readLine()) != null)) {
                String[] s = temp.split("\\t");  //split by whitespace
                if (currLine > 0) {
                    String enzymeName;
                    if (s.length < 9) {
                        enzymeName = "";
                    } else {
                        enzymeName = s[8];
                    }
                    if (!enzymeName.equals("")) {
                        result = enzymeName;
                        break;
                    }
                }
                currLine++;
            }
        } catch (Exception e) {
            System.out.println("Couldn't open key file to read Enzyme: " + e);
        }
        return result;
    }

    /**
     * Reads in an Illumina key file, creates a linear array of {@link Barcode} objects
     * representing the barcodes in the key file, then creates a hash map containing
     * indices from the linear array indexed by sequence.  The names of barcode objects
     * follow the pattern samplename:flowcell:lane:LibraryPrepID, since sample names alone are not unique.
     *
     * @param keyFile Illumina key file.
     * @param flowcell Only barcodes from this flowcell will be added to the array.
     * @param lane Only barcodes from this lane will be added to the array.
     * @return Number of barcodes in the array.  
     */
    private int setupBarcodeFiles(File keyFile, String flowcell, String lane) {
        try {
            BufferedReader br = new BufferedReader(new FileReader(keyFile), 65536);
            ArrayList theBarcodesArrayList = new ArrayList();
            String temp;
            while (((temp = br.readLine()) != null)) {
                String[] s = temp.split("\\t");  //split by whitespace
                Barcode theBC = null;
                if (s[0].equals(flowcell) && s[1].equals(lane)) {
                    if (s.length < 8 || s[7] == null || s[7].trim().equals("")) {  // column H (LibraryPrepID) is required for TASSEL5
                        throw new IllegalArgumentException("Column H of the key file must contain a LibraryPrepID (integer or alphanumeric)");
                    } else {  // use the "libraryPrepID" or whatever is in column H of the key file
                        theBC = new Barcode(s[2], initialCutSiteRemnant, s[3]+":"+s[0]+":"+s[1]+":"+ s[7],-1,flowcell,lane);
                    }
                    theBarcodesArrayList.add(theBC);
                    System.out.println(theBC.barcodeS + " " + theBC.taxaName);
                }
            }
            theBarcodes = new Barcode[theBarcodesArrayList.size()];
            theBarcodesArrayList.toArray(theBarcodes);
            Arrays.sort(theBarcodes);
            int nBL = theBarcodes[0].barOverLong.length;
            quickBarcodeList = new long[theBarcodes.length * nBL];
            quickMap = new HashMap();
            for (int i = 0; i < theBarcodes.length; i++) {
                for (int j = 0; j < nBL; j++) {
                    quickBarcodeList[i * nBL + j] = theBarcodes[i].barOverLong[j];
                    quickMap.put(theBarcodes[i].barOverLong[j], i);
                }
            }
            Arrays.sort(quickBarcodeList);
        } catch (Exception e) {
            System.out.println("Error with setupBarcodeFiles: " + e);
        }
        return theBarcodes.length;
    }

    /**
     * Returns the best barcode match for a given sequence.  
     * @param queryS query sequence to be tested against all barcodes
     * @param maxDivergence maximum divergence to permit
     * @return best barcode match (null if no good match)
     */
    Barcode findBestBarcode(String queryS, int maxDivergence) {
        long query = BaseEncoder.getLongFromSeq(queryS.substring(0, chunkSize));
        //note because the barcodes are polyA after the sequence, they should always
        //sort ahead of the hit, this is the reason for the -(closestHit+2)
        int closestHit = Arrays.binarySearch(quickBarcodeList, query);

        /*      THIS IS THE NEW PIPELINE APPROACH THAT DOES NOT WORK
         if(closestHit>-2) return null; //hit or perfect
         if((query&quickBarcodeList[-(closestHit+2)])!=quickBarcodeList[-(closestHit+2)]) return null;
         int index =quickMap.get(quickBarcodeList[-(closestHit+2)]);
         //      System.out.println(theBarcodes[index].barcodeS);
         return theBarcodes[index];
         //note to see if it is a perfect match you can just bit AND
         */

        //  Below is the old pipeline approach, which works (at least for maxDivergence of 0)
        if (closestHit < -1) {  // should always be true, as the barcode+overhang is padded to 32 bases with polyA
            int index = quickMap.get(quickBarcodeList[-(closestHit + 2)]);
            if (theBarcodes[index].compareSequence(query, 1) == 0) {
                return theBarcodes[index];
            } else if (maxDivergence == 0) {
                return null;  // return null if not a perfect match
            }
        } else {
            return null;  // should never go to this line
        }
        int maxLength = 0, minDiv = maxDivergence + 1, countBest = 0;
        Barcode bestBC = null;
        for (Barcode bc : theBarcodes) {
            int div = bc.compareSequence(query, maxDivergence + 1);
            if (div <= minDiv) {
                if ((div < minDiv) || (bc.barOverLength > maxLength)) {
                    minDiv = div;
                    maxLength = bc.barOverLength;
                    bestBC = bc;
                    countBest = 1;
                } else {  //it is a tie, so return that not resolvable
                    bestBC = null;
                    countBest++;
                }
            }
        }
        return bestBC;
    }

    /**
     * The barcode libraries used for this study can include two types of
     * extraneous sequence at the end of reads. The first are chimeras created
     * with the free ends. These will recreate the restriction site. The second
     * are short regions (less than 64bp), so that will they will contain a
     * portion of site and the universal adapter. This finds the first of site
     * in likelyReadEnd, keeps the restriction site overhang and then sets
     * everything to polyA afterwards
     *
     * @param seq An unprocessed tag sequence.
     * @param maxLength The maximum number of bp in the processed sequence.
     * @return returnValue A ReadBarcodeResult object containing the unprocessed
     * tag, Cut site position, Processed tag, and Poly-A padded tag.
     */
    public static ReadBarcodeResult removeSeqAfterSecondCutSite(String seq, byte maxLength) {
        //this looks for a second restriction site or the common adapter start, and then turns the remaining sequence to AAAA
        int cutSitePosition = 9999;
        ReadBarcodeResult returnValue = new ReadBarcodeResult(seq);

        //Look for cut sites, starting at a point past the length of the initial cut site remnant that all reads begin with
        String match = null;
        for (String potentialCutSite : likelyReadEnd) {
            int p = seq.indexOf(potentialCutSite, 1);
            if ((p > 1) && (p < cutSitePosition)) {
                cutSitePosition = p;
                match = potentialCutSite;
            }
        }
        if (theEnzyme.equalsIgnoreCase("ApeKI") && cutSitePosition == 2
                && (match.equalsIgnoreCase("GCAGC") || match.equalsIgnoreCase("GCTGC"))) {  // overlapping ApeKI cut site: GCWGCWGC
            seq = seq.substring(3, seq.length());  // trim off the initial GCW from GCWGCWGC
            cutSitePosition = 9999;
            returnValue.unprocessedSequence = seq;
            for (String potentialCutSite : likelyReadEnd) {
                int p = seq.indexOf(potentialCutSite, 1);
                if ((p > 1) && (p < cutSitePosition)) {
                    cutSitePosition = p;
                }
            }
        }

        if (cutSitePosition < maxLength) {  // Cut site found
            //Trim tag to sequence up to & including the cut site
            returnValue.length = (byte) (cutSitePosition + readEndCutSiteRemnantLength);
            returnValue.processedSequence = seq.substring(0, cutSitePosition + readEndCutSiteRemnantLength);
        } else {
            if (seq.length() <= 0) {
                //If cut site is missing because there is no sequence
                returnValue.processedSequence = "";
                returnValue.length = 0;
            } else {
                //If cut site is missing because it is beyond the end of the sequence (or not present at all)
                returnValue.length = (byte) Math.min(seq.length(), maxLength);
                returnValue.processedSequence = (seq.substring(0, returnValue.length));
            }
        }

        //Pad sequences shorter than max. length with A
        if (returnValue.length < maxLength) {
            returnValue.paddedSequence = returnValue.processedSequence + nullS;
            returnValue.paddedSequence = returnValue.paddedSequence.substring(0, maxLength);
        } else {
            //Truncate sequences longer than max. length
            returnValue.paddedSequence = returnValue.processedSequence.substring(0, maxLength);
            returnValue.length = maxLength;
        }
        return returnValue;
    }

    /**
     * Return a {@link ReadBarcodeResult} that captures the processed read and taxa
     * inferred by the barcode
     * @param seqS DNA sequence from the sequencer
     * @param qualS quality score string from the sequencer
     * @param fastq (fastq = true?; qseq=false?)
     * @param minQual minimum quality score
     * @return If barcode and cut site was found returns the result and
     * processed sequence, if the barcode and cut site were not found return
     * null
     */
    public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, boolean fastq, int minQual) {
        long[] read = new long[2];
        if ((minQual > 0) && (qualS != null)) {
            int firstBadBase = BaseEncoder.getFirstLowQualityPos(qualS, minQual);
            if (firstBadBase < (maxBarcodeLength + 2 * chunkSize)) {
                return null;
            }
        }
        int miss = -1;
        if (fastq) {
            miss = seqS.lastIndexOf('N', maxBarcodeLength + 2*chunkSize - 1);
        } else {
            miss = seqS.lastIndexOf('.', maxBarcodeLength + 2*chunkSize - 1);
        }
        if (miss != -1) {
            return null;  //bad sequence so skip
        }
        Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
        if (bestBarcode == null) {
            return null;  //overhang missing so skip
        }
        String genomicSeq = seqS.substring(bestBarcode.barLength, seqS.length());
        ReadBarcodeResult tagProcessingResults = removeSeqAfterSecondCutSite(genomicSeq, (byte) (2 * chunkSize));
        String hap = tagProcessingResults.paddedSequence;  //this is slow 20% of total time.   Tag, cut site processed, padded with poly-A

        read = BaseEncoder.getLongArrayFromSeq(hap);
        int pos = tagProcessingResults.length;
        //TODO this instantiation should also include the orginal unprocessedSequence, processedSequence, and paddedSequence - the the object encode it 

        ReadBarcodeResult rbr = new ReadBarcodeResult(read, (byte) pos, bestBarcode.getTaxaName(),bestBarcode.taxaIndex);
        return rbr;
    }
    
   /**Returns the number of barcodes for the flowcell and lane*/
    public int getBarCodeCount() {
        return theBarcodes.length;
    }

    /**Returns the {@link Barcode} for the flowcell and lane*/
    public Barcode getTheBarcodes(int index) {
        return theBarcodes[index];
    }

    /**Returns the taxaNames for the flowcell and lane*/
    public String[] getTaxaNames() {
        String[] result = new String[getBarCodeCount()];
        for (int i = 0; i < result.length; i++) {
            result[i] = getTheBarcodes(i).getTaxaName();
        }
        return result;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy