Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
package net.maizegenetics.analysis.gbs;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import net.maizegenetics.dna.BaseEncoder;
/**
* Takes a key file and then sets up the methods to decode a read from the sequencer.
* The key file decribes how barcodes are related to their taxon. Generally, a keyfile
* with all flowcells is included, and then the flowcell and lane to be processed are
* indicated in the constructor.
*
* @author Ed Buckler, Jeff Glaubitz, and James Harriman
*
*/
public class ParseBarcodeRead {
private static int chunkSize = BaseEncoder.chunkSize;
protected int maximumMismatchInBarcodeAndOverhang = 0;
protected static String[] initialCutSiteRemnant = null;
protected static int readEndCutSiteRemnantLength;
static String nullS = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
protected static String[] likelyReadEnd = null;
protected static String theEnzyme = null;
static int maxBarcodeLength = 10;
private Barcode[] theBarcodes;
private long[] quickBarcodeList;
private HashMap quickMap;
/**
* Create the barcode parsing object
* @param keyFile file location for the keyfile
* @param enzyme name of the enzyme
* @param flowcell name of the flowcell to be processed
* @param lane name of the lane to be processed
*/
public ParseBarcodeRead(String keyFile, String enzyme, String flowcell, String lane) {
if (enzyme != null) {
chooseEnzyme(enzyme);
} else {
chooseEnzyme(getKeyFileEnzyme(keyFile));
}
int totalBarcodes = setupBarcodeFiles(new File(keyFile), flowcell, lane);
System.out.println("Total barcodes found in lane:" + totalBarcodes);
}
/**
* Determines which cut sites to look for, and sets them, based on the
* enzyme used to generate the GBS library. For two-enzyme GBS both enzymes
* MUST be specified and separated by a dash "-". e.g. PstI-MspI, SbfI-MspI
*
* @param enzyme The name of the enzyme (case insensitive)
*/
//TODO these should all be private static final globals, then just use this set which one is active.
public static void chooseEnzyme(String enzyme) {
// Check for case-insensitive (?i) match to a known enzyme
// The common adapter is: [readEndCutSiteRemnant]AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
if (enzyme.matches("(?i)apek[i1]")) {
theEnzyme = "ApeKI";
initialCutSiteRemnant = new String[]{"CAGC", "CTGC"};
likelyReadEnd = new String[]{"GCAGC", "GCTGC", "GCAGAGAT", "GCTGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)pst[i1]")) {
theEnzyme = "PstI";
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"CTGCAG", "CTGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)ecot22[i1]")) {
theEnzyme = "EcoT22I";
initialCutSiteRemnant = new String[]{"TGCAT"};
likelyReadEnd = new String[]{"ATGCAT", "ATGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)pas[i1]")) {
theEnzyme = "PasI";
initialCutSiteRemnant = new String[]{"CAGGG", "CTGGG"};
likelyReadEnd = new String[]{"CCCAGGG", "CCCTGGG", "CCCTGAGAT", "CCCAGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)hpaii|(?i)hpa2")) {
theEnzyme = "HpaII";
initialCutSiteRemnant = new String[]{"CGG"};
likelyReadEnd = new String[]{"CCGG", "CCGAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)msp[i1]")) {
theEnzyme = "MspI"; // MspI and HpaII are isoschizomers (same recognition seq and overhang)
initialCutSiteRemnant = new String[]{"CGG"};
likelyReadEnd = new String[]{"CCGG", "CCGAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)pst[i1]-apek[i1]")) {
theEnzyme = "PstI-ApeKI";
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"GCAGC", "GCTGC", "CTGCAG", "GCAGAGAT", "GCTGAGAT"}; // look for ApeKI site, PstI site, or common adapter for ApeKI
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)pst[i1]-bfa[i1]")) {
theEnzyme = "PstI-BfaI"; // PstI: CTGCA^G BfaI: C^TAG
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"CTGCAG", "CTAG", "CTAAGATC"}; // look for PstI site, BfaI site, or common adapter start for BfaI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)pst[i1]-ecot22[i1]")) {
theEnzyme = "PstI-EcoT22I";
initialCutSiteRemnant = new String[]{"TGCAG", "TGCAT"};
likelyReadEnd = new String[]{"ATGCAT", "CTGCAG", "CTGCAAGAT", "ATGCAAGAT"}; // look for EcoT22I site, PstI site, or common adapter for PstI/EcoT22I
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)pst[i1]-msp[i1]")) {
theEnzyme = "PstI-MspI";
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"CCGG", "CTGCAG", "CCGAGATC"}; // look for MspI site, PstI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)pst[i1]-msp[i1]-GDFcustom")) {
theEnzyme = "PstI-MspI-GDFcustom";
initialCutSiteRemnant = new String[]{"TGCAG"};
// changed from CCGAGAT to CCGCTCAGG, as IGD/GDF used a custom Y adapter for MspI
likelyReadEnd = new String[]{"CCGG", "CTGCAG", "CCGCTCAGG"}; // look for MspI site, PstI site, or GDF custom common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)pst[i1]-taq[i1]")) {
theEnzyme = "PstI-TaqI";
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"TCGA", "CTGCAG", "TCGAGATC"}; // look for TaqI site, PstI site, or common adapter for TaqI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)nsi[i1]-msp[i1]")) {
theEnzyme = "NsiI-MspI"; // ATGCA^T C^CGG
initialCutSiteRemnant = new String[]{"TGCAT"};
likelyReadEnd = new String[]{"CCGG", "ATGCAT", "CCGAGATC"}; // look for MspI site, NsiI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)PaeR7[i1]-Hha[i1]")) {
theEnzyme = "PaeR7I-HhaI";
// Requested by Ye, Songqing, use same Y adapter as Polland paper -QS
initialCutSiteRemnant=new String[]{"TCGAG"};
likelyReadEnd = new String[]{"GCGC", "CTCGAG", "GCGAGATC"}; // look for HhaI site, PaeR7I site, or common adapter for HhaI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)sbf[i1]-msp[i1]")) {
theEnzyme = "SbfI-MspI"; // CCTGCA^GG C^CGG
initialCutSiteRemnant = new String[]{"TGCAGG"};
likelyReadEnd = new String[]{"CCGG", "CCTGCAGG", "CCGAGATC"}; // look for MspI site, SbfI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)sbf[i1]-hpaii|(?i)sbf[i1]-hpa2")) {
theEnzyme = "SbfI-HpaII"; // CCTGCA^GG C^CGG Nb: HpaII is an isoschizomer of MspI
initialCutSiteRemnant = new String[]{"TGCAGG"};
likelyReadEnd = new String[]{"CCGG", "CCTGCAGG", "CCGAGATC"}; // look for HpaII site, SbfI site, or common adapter for HpaII
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)sbf[i1]-bfa[i1]")) {
theEnzyme = "SbfI-BfaI"; // CCTGCA^GG C^TAG
initialCutSiteRemnant = new String[]{"TGCAGG"};
likelyReadEnd = new String[]{"CTAG", "CCTGCAGG", "CTAAGATC"}; // look for BfaI site, SbfI site, or common adapter for BfaI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)sph[i1]-ecor[i1]")) {
theEnzyme = "SphI-EcoRI"; // GCATG^C G^AATTC
initialCutSiteRemnant = new String[]{"CATGC"}; // SphI overhang from the genomic DNA fragment (top strand)
likelyReadEnd = new String[]{"GCATGC", "GAATTC", "GAATTAGATC"}; // look for SphI site, EcoRI site, or common adapter for EcoRI
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)asis[i1]-msp[i1]")) {
theEnzyme = "AsiSI-MspI";
initialCutSiteRemnant = new String[]{"ATCGC"};
likelyReadEnd = new String[]{"CCGG", "GCGATCGC", "CCGAGATC"}; // look for MspI site, AsiSI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)bsshii-msp[i1]|(?i)bssh2-msp[i1]")) {
theEnzyme = "BssHII-MspI";
initialCutSiteRemnant = new String[]{"CGCGC"};
likelyReadEnd = new String[]{"CCGG", "GCGCGC", "CCGAGATC"}; // look for MspI site, BssHII site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)fse[i1]-msp[i1]")) {
theEnzyme = "FseI-MspI";
initialCutSiteRemnant = new String[]{"CCGGCC"};
likelyReadEnd = new String[]{"CCGG", "GGCCGGCC", "CCGAGATC"}; // look for MspI site, FseI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)sal[i1]-msp[i1]")) {
theEnzyme = "SalI-MspI";
initialCutSiteRemnant = new String[]{"TCGAC"};
likelyReadEnd = new String[]{"CCGG", "GTCGAC", "CCGAGATC"}; // look for MspI site, SalI site, or common adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)ecor[i1]-msp[i1]")) {
theEnzyme = "EcoRI-MspI"; // G^AATTC C^CGG
initialCutSiteRemnant = new String[]{"AATTC"};
likelyReadEnd = new String[]{"CCGG", "GAATTC", "CCGAGATC"}; // look for MspI site, EcoRI site, or Poland et al. 2012 Y-adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)hindiii-msp[i1]|(?i)hind3-msp[i1]")) {
theEnzyme = "HindIII-MspI"; // A^AGCTT C^CGG
initialCutSiteRemnant = new String[]{"AGCTT"};
likelyReadEnd = new String[]{"CCGG", "AAGCTT", "CCGAGATC"}; // look for MspI site, HindIII site, or Poland et al. 2012 Y-adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)hindiii-nlaiii|(?i)hind3-nla3")) {
theEnzyme = "HindIII-NlaIII"; // A^AGCTT ^CATG (not blunt)
initialCutSiteRemnant = new String[]{"AGCTT"};
likelyReadEnd = new String[]{"CATG", "AAGCTT", "CATGAGATC"}; // look for NlaIII site, HindIII site, or Poland et al. 2012 Y-adapter for Nla3
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)sexa[i1]-sau3a[i1]")) {
theEnzyme = "SexAI-Sau3AI"; // A^CCWGGT ^GATC (not blunt)
initialCutSiteRemnant = new String[]{"CCAGGT", "CCTGGT"};
likelyReadEnd = new String[]{"GATC", "ACCAGGT", "ACCTGGT", "GATCAGATC"}; // look for SexAI site, Sau3AI site, or Poland et al. 2012 Y-adapter for Sau3AI
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)bamh[i1l]-mluc[i1]")) {
theEnzyme = "BamHI-MluCI"; // G^GATCC ^AATT (not blunt)
initialCutSiteRemnant = new String[]{"GATCC"};
likelyReadEnd = new String[]{"AATT", "GGATCC", "AATTAGATC"}; // look for MluCI site, BamHI site, or Poland et al. 2012 Y-adapter for MluCI
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)pst[i1]-mluc[i1]")) {
theEnzyme = "PstI-MluCI"; // CTGCA^G ^AATT (not blunt)
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"AATT", "CTGCAG", "AATTAGATC"}; // look for MluCI site, PstI site, or Poland et al. 2012 Y-adapter for MluCI
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)psti-msei|(?i)pst1-mse1")) {
theEnzyme = "PstI-MseI"; // CTGCA^G T^TAA
initialCutSiteRemnant = new String[]{"TGCAG"};
likelyReadEnd = new String[]{"TTAA", "CTGCAG", "TTAAGATC"}; // look for MseI site, PstI site, or Poland et al. 2012 Y-adapter for MseI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)avaii-msei|(?i)ava2-mse1")) {
theEnzyme = "AvaII-MseI"; // G^GWCC T^TAA W=AorT
initialCutSiteRemnant = new String[]{"GACC", "GTCC"};
likelyReadEnd = new String[]{"TTAA", "GGACC", "GGTCC", "TTAAGATC"}; // look for MseI site, AvaII site, or Poland et al. 2012 Y-adapter for MseI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)ecori-msei|(?i)ecor1-mse1")) {
theEnzyme = "EcoRI-MseI"; // G^AATTC T^TAA
initialCutSiteRemnant = new String[]{"AATTC"};
likelyReadEnd = new String[]{"TTAA", "GAATTC", "TTAAGATC"}; // look for MseI site, EcoRI site, or Poland et al. 2012 Y-adapter for MseI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)ecori-avaii|(?i)ecor1-ava2")) {
theEnzyme = "EcoRI-AvaII"; // G^AATTC G^GWCC
initialCutSiteRemnant = new String[]{"AATTC"};
likelyReadEnd = new String[]{"GGACC", "GGTCC", "GAATTC", "GGACAGATC", "GGTCAGATC"}; // look for AvaII site, EcoRI site, or Poland et al. 2012 Y-adapter for AvaII
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)ecori-hinfi|(?i)ecor1-hinf1")) {
theEnzyme = "EcoRI-HinfI"; // G^AATTC G^ANTC
initialCutSiteRemnant = new String[]{"AATTC"};
likelyReadEnd = new String[]{"GAATC", "GACTC", "GAGTC", "GATTC", "GAATTC", "GAATAGATC", "GACTAGATC", "GAGTAGATC", "GATTAGATC"}; // look for HinfI site, EcoRI site, or Poland et al. 2012 Y-adapter for HinfI
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)bbvci-mspi|(?i)bbvc1-msp1")) {
theEnzyme = "BbvCI-MspI"; // CCTCAGC (-5/-2) C^CGG
initialCutSiteRemnant = new String[]{"TCAGC"};
likelyReadEnd = new String[]{"CCGG", "CCTCAGC", "CCGAGATC"}; // look for MspI site, BbvCI site, or Poland et al. 2012 Y-adapter for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)msp[i1]-apek[i1]")) {
theEnzyme = "MspI-ApeKI"; // C^CGG G^CWGC
initialCutSiteRemnant = new String[]{"CGG", "CAGC", "CTGC"};
likelyReadEnd = new String[]{"CCGG", "GCAGC", "GCTGC", "CCGAGATCGG", "GCAGAGAT", "GCTGAGAT"};
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)apo[i1]")){
theEnzyme = "ApoI";
initialCutSiteRemnant=new String[]{"AATTC","AATTT"};
likelyReadEnd = new String[]{"AAATTC","AAATTT","GAATTC","GAATTT","AAATTAGAT","GAATTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)BamH[i1l]")) {
theEnzyme = "BamHI";
initialCutSiteRemnant = new String[]{"GATCC"};
likelyReadEnd = new String[]{"GGATCC", "GGATCAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
// likelyReadEnd = new String[]{"GGATCC", "AGATCGGAA", "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG"}; // <-- corrected from this by Jeff Glaubitz on 2012/09/12
readEndCutSiteRemnantLength = 5;
} else if (enzyme.matches("(?i)mse[i1]")) {
theEnzyme = "MseI";
initialCutSiteRemnant = new String[]{"TAA"};
likelyReadEnd = new String[]{"TTAA", "TTAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)Sau3A[i1]")){
theEnzyme = "Sau3AI";
initialCutSiteRemnant=new String[]{"GATC"};
likelyReadEnd = new String[]{"GATC","GATCAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if(enzyme.matches("(?i)nde[i1]")){
theEnzyme = "NdeI";
initialCutSiteRemnant=new String[]{"TATG"};
likelyReadEnd = new String[]{"CATATG","CATAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if(enzyme.matches("(?i)hinp1[i1]")){
theEnzyme = "HinP1I";
initialCutSiteRemnant=new String[]{"CGC"};
likelyReadEnd = new String[]{"GCGC","GCGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)sbf[i1]")){
theEnzyme = "SbfI";
initialCutSiteRemnant=new String[]{"TGCAGG"};
likelyReadEnd = new String[]{"CCTGCAGG","CCTGCAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 6;
} else if (enzyme.matches("(?i)hindiii|(?i)hind3")) {
theEnzyme = "HindIII"; // A^AGCTT
initialCutSiteRemnant=new String[]{"AGCTT"};
likelyReadEnd = new String[]{"AAGCTT","AAGCTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)ecor[i1]")) {
theEnzyme = "EcoRI"; // G^AATTC
initialCutSiteRemnant= new String[]{"AATTC"};
likelyReadEnd = new String[]{"GAATTC","GAATTAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)cviq[i1]")){
theEnzyme = "CviQI"; // CviQI and Csp6I are isoschizomers (same recognition seq and overhang)
initialCutSiteRemnant=new String[]{"TAC"};
likelyReadEnd = new String[]{"GTAC","GTAAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)csp6[i1]")){
theEnzyme = "Csp6I"; // Csp6I and CviQI are isoschizomers (same recognition seq and overhang)
initialCutSiteRemnant=new String[]{"TAC"};
likelyReadEnd = new String[]{"GTAC","GTAAGATCGG"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)nlaiii|(?i)nla3")) {
theEnzyme = "NlaIII"; // CATG^
initialCutSiteRemnant=new String[]{"CATG"};
likelyReadEnd = new String[]{"CATG","CATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if(enzyme.matches("(?i)sph[i1]")){
theEnzyme = "SphI"; // GCATG^C
initialCutSiteRemnant=new String[]{"CATGC"};
likelyReadEnd = new String[]{"GCATGC","GCATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)nsp[i1]")){
theEnzyme = "NspI"; // RCATG^Y
initialCutSiteRemnant=new String[]{"CATGC","CATGT"};
likelyReadEnd = new String[]{"ACATGT","GCATGC","ACATGAGAT","GCATGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)kpn[i1]")){
theEnzyme = "KpnI"; // GGTAC^C
initialCutSiteRemnant=new String[]{"GTACC"};
likelyReadEnd = new String[]{"GGTACC","GGTACAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)sty[i1]")){
theEnzyme = "StyI"; // C^CWWGG
initialCutSiteRemnant=new String[]{"CAAGG","CATGG","CTAGG","CTTGG"};
likelyReadEnd = new String[]{"CCAAGG","CCATGG","CCTAGG","CCTTGG","CCAAGAGAT","CCATGAGAT","CCTAGAGAT","CCTTGAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 5;
} else if(enzyme.matches("(?i)styi-msei|(?i)sty1-mse1")){
theEnzyme = "StyI-MseI"; // C^CWWGG & T^TAA
initialCutSiteRemnant=new String[]{"CAAGG","CATGG","CTAGG","CTTGG"};
likelyReadEnd = new String[]{"TTAA","CCAAGG","CCATGG","CCTAGG","CCTTGG","TTAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)fse[i1]")){
theEnzyme = "FseI"; // GGCCGG^CC
initialCutSiteRemnant=new String[]{"CCGGCC"};
likelyReadEnd = new String[]{"GGCCGGCC","AGATCGGAAG"}; // full cut site (from partial digest or chimera) or Morishige et al (BMC Genomics, 2013) T adapter start
readEndCutSiteRemnantLength = 0; // assumes that common T adapter is far more likely than a second full cut site
} else if(enzyme.matches("(?i)NgoMIV|(?i)NgoM4")){
theEnzyme = "NgoMIV"; // G^CCGGC
initialCutSiteRemnant=new String[]{"CCGGC"};
likelyReadEnd = new String[]{"GCCGGC","AGATCGGAAG"}; // full cut site (from partial digest or chimera) or Morishige et al (BMC Genomics, 2013) T adapter start
readEndCutSiteRemnantLength = 0; // assumes that common T adapter is far more likely than a second full cut site
} else if(enzyme.matches("(?i)msl[i1]")){
theEnzyme = "MslI"; // CAYNN^NNRTG -- has 32 different cut sites (assuming constrained to palindromic YNN -- 32^2 otherwise)
initialCutSiteRemnant=new String[]{""};
likelyReadEnd = new String[]{"AGATCGGA"}; // common adapter start only (too many possible cut sites!)
readEndCutSiteRemnantLength = 0;
} else if(enzyme.matches("(?i)ase[i1]")){
theEnzyme = "AseI"; // AT^TAAT
initialCutSiteRemnant=new String[]{"TAAT"};
likelyReadEnd = new String[]{"ATTAAT","ATTAAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if(enzyme.matches("(?i)avaii|(?i)ava2")){
theEnzyme = "AvaII"; // G^GWCC
initialCutSiteRemnant=new String[]{"GACC","GTCC"};
likelyReadEnd = new String[]{"GGACC","GGTCC","GGACAGAT","GGTCAGAT"}; // full cut site (from partial digest or chimera) or common adapter start
readEndCutSiteRemnantLength = 4;
} else if (enzyme.matches("(?i)kpn[i1]-msp[i1]")) {
theEnzyme = "KpnI-MspI"; // KpnI: GGTAC^C MspI: C^CGG
initialCutSiteRemnant = new String[]{"GTACC"};
likelyReadEnd = new String[]{"CCGG", "GGTACC", "CCGAGATC"}; // look for MspI site, KpnI site, or common adapter start for MspI
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)RBSTA")) {
theEnzyme = "RBSTA";
initialCutSiteRemnant = new String[]{"TA"};
likelyReadEnd = new String[]{"TTAA", "GTAC", "CTAG", "TTAAGAT", "GTAAGAT", "CTAAGAT"}; // full cut site (from partial digest or chimera) of MseI, CVIQi, XspI or common adapter start
readEndCutSiteRemnantLength = 3;
} else if (enzyme.matches("(?i)RBSCG")) {
theEnzyme = "RBSCG";
initialCutSiteRemnant = new String[]{"CG"};
likelyReadEnd = new String[]{"CCGC", "TCGA", "GCGC", "CCGG", "ACGT", "CCGAGAT", "TCGAGAT", "GCGAGAT", "ACGAGAT"}; // full cut site (from partial digest or chimera) of AciI, TaqaI, HinpI, HpaII, HpyCH4IV or common adapter start
readEndCutSiteRemnantLength = 3;
} else if(enzyme.matches("(?i)ignore")){
theEnzyme = "unspecified"; // can be used for new enzymes -- only looks for barcodes and common adapter starts
initialCutSiteRemnant=new String[]{""};
likelyReadEnd = new String[]{"AGATCGGA"}; // common adapter start only
readEndCutSiteRemnantLength = 0;
} else {
System.out.println("The software didn't recognize your restriction enzyme (-e option).\n"
+"Currently, only the following enzymes are recognized for single enzyme digests:\n"
+" ApeKI" +"\n"
+" ApoI" +"\n"
+" AseI" +"\n"
+" AvaII" +"\n"
+" BamHI" +"\n"
+" Csp6I" +"\n"
+" CviQI" +"\n"
+" EcoRI" +"\n"
+" EcoT22I" +"\n"
+" FseI" +"\n"
+" HindIII" +"\n"
+" HinP1I" +"\n"
+" HpaII" +"\n"
+" KpnI" +"\n"
+" MseI" +"\n"
+" MslI" +"\n"
+" MspI" +"\n"
+" NdeI" +"\n"
+" NgoMIV" +"\n"
+" NlaIII" +"\n"
+" NspI" +"\n"
+" PasI" +"\n"
+" PstI" +"\n"
+" Sau3AI" +"\n"
+" SbfI" +"\n"
+" SphI" +"\n"
+" StyI" +"\n"
+" RBSTA" +"\n"
+" RBSCG" +"\n"
+" ignore" +"\n"
+"Or the following for two-enzyme digests:\n"
+" AsiSI-MspI" +"\n"
+" AvaII-MseI" +"\n"
+" BamHI-MluCI" +"\n"
+" BbvCI-MspI" +"\n"
+" BssHII-MspI" +"\n"
+" EcoRI-AvaII" +"\n"
+" EcoRI-HinfI" +"\n"
+" EcoRI-MseI" +"\n"
+" EcoRI-MspI" +"\n"
+" FseI-MspI" +"\n"
+" HindIII-MspI" +"\n"
+" HindIII-NlaIII" +"\n"
+" KpnI-MspI" +"\n"
+" MspI-ApeKI" +"\n"
+" NsiI-MspI" +"\n"
+" PaeR7I-HhaI" +"\n"
+" PstI-ApeKI" +"\n"
+" PstI-BfaI" +"\n"
+" PstI-EcoT22I" +"\n"
+" PstI-MluCI" +"\n"
+" PstI-MseI" +"\n"
+" PstI-MspI" +"\n"
+" PstI-MspI-GDFcustom"+"\n"
+" PstI-TaqI" +"\n"
+" SbfI-BfaI" +"\n"
+" SbfI-HpaII" +"\n"
+" SalI-MspI" +"\n"
+" SbfI-MspI" +"\n"
+" SexAI-Sau3AI" +"\n"
+" SphI-EcoRI" +"\n"
+" StyI-MseI" +"\n"
+" ignore" +"\n"
);
System.out.println("For two-enzyme digest, enzyme names should be separated by a dash, e.g. -e PstI-MspI");
System.out.println("\nIf your enzyme is not on the above list you can use \"-e ignore\". In this case\n"
+" barcodes and common adapter start sequences will be recognized, but chimeric DNA\n"
+" fragments (or partial digests) will not be trimmed.");
}
System.out.println("Enzyme: " + theEnzyme);
}
private String getKeyFileEnzyme(String keyFileName) {
String result = null;
try {
BufferedReader br = new BufferedReader(new FileReader(keyFileName), 65536);
String temp;
int currLine = 0;
while (((temp = br.readLine()) != null)) {
String[] s = temp.split("\\t"); //split by whitespace
if (currLine > 0) {
String enzymeName;
if (s.length < 9) {
enzymeName = "";
} else {
enzymeName = s[8];
}
if (!enzymeName.equals("")) {
result = enzymeName;
break;
}
}
currLine++;
}
} catch (Exception e) {
System.out.println("Couldn't open key file to read Enzyme: " + e);
}
return result;
}
/**
* Reads in an Illumina key file, creates a linear array of {@link Barcode} objects
* representing the barcodes in the key file, then creates a hash map containing
* indices from the linear array indexed by sequence. The names of barcode objects
* follow the pattern samplename:flowcell:lane:LibraryPrepID, since sample names alone are not unique.
*
* @param keyFile Illumina key file.
* @param flowcell Only barcodes from this flowcell will be added to the array.
* @param lane Only barcodes from this lane will be added to the array.
* @return Number of barcodes in the array.
*/
private int setupBarcodeFiles(File keyFile, String flowcell, String lane) {
try {
BufferedReader br = new BufferedReader(new FileReader(keyFile), 65536);
ArrayList theBarcodesArrayList = new ArrayList();
String temp;
while (((temp = br.readLine()) != null)) {
String[] s = temp.split("\\t"); //split by whitespace
Barcode theBC = null;
if (s[0].equals(flowcell) && s[1].equals(lane)) {
if (s.length < 8 || s[7] == null || s[7].trim().equals("")) { // column H (LibraryPrepID) is required for TASSEL5
throw new IllegalArgumentException("Column H of the key file must contain a LibraryPrepID (integer or alphanumeric)");
} else { // use the "libraryPrepID" or whatever is in column H of the key file
theBC = new Barcode(s[2], initialCutSiteRemnant, s[3]+":"+s[0]+":"+s[1]+":"+ s[7],-1,flowcell,lane);
}
theBarcodesArrayList.add(theBC);
System.out.println(theBC.barcodeS + " " + theBC.taxaName);
}
}
theBarcodes = new Barcode[theBarcodesArrayList.size()];
theBarcodesArrayList.toArray(theBarcodes);
Arrays.sort(theBarcodes);
int nBL = theBarcodes[0].barOverLong.length;
quickBarcodeList = new long[theBarcodes.length * nBL];
quickMap = new HashMap();
for (int i = 0; i < theBarcodes.length; i++) {
for (int j = 0; j < nBL; j++) {
quickBarcodeList[i * nBL + j] = theBarcodes[i].barOverLong[j];
quickMap.put(theBarcodes[i].barOverLong[j], i);
}
}
Arrays.sort(quickBarcodeList);
} catch (Exception e) {
System.out.println("Error with setupBarcodeFiles: " + e);
}
return theBarcodes.length;
}
/**
* Returns the best barcode match for a given sequence.
* @param queryS query sequence to be tested against all barcodes
* @param maxDivergence maximum divergence to permit
* @return best barcode match (null if no good match)
*/
Barcode findBestBarcode(String queryS, int maxDivergence) {
long query = BaseEncoder.getLongFromSeq(queryS.substring(0, chunkSize));
//note because the barcodes are polyA after the sequence, they should always
//sort ahead of the hit, this is the reason for the -(closestHit+2)
int closestHit = Arrays.binarySearch(quickBarcodeList, query);
/* THIS IS THE NEW PIPELINE APPROACH THAT DOES NOT WORK
if(closestHit>-2) return null; //hit or perfect
if((query&quickBarcodeList[-(closestHit+2)])!=quickBarcodeList[-(closestHit+2)]) return null;
int index =quickMap.get(quickBarcodeList[-(closestHit+2)]);
// System.out.println(theBarcodes[index].barcodeS);
return theBarcodes[index];
//note to see if it is a perfect match you can just bit AND
*/
// Below is the old pipeline approach, which works (at least for maxDivergence of 0)
if (closestHit < -1) { // should always be true, as the barcode+overhang is padded to 32 bases with polyA
int index = quickMap.get(quickBarcodeList[-(closestHit + 2)]);
if (theBarcodes[index].compareSequence(query, 1) == 0) {
return theBarcodes[index];
} else if (maxDivergence == 0) {
return null; // return null if not a perfect match
}
} else {
return null; // should never go to this line
}
int maxLength = 0, minDiv = maxDivergence + 1, countBest = 0;
Barcode bestBC = null;
for (Barcode bc : theBarcodes) {
int div = bc.compareSequence(query, maxDivergence + 1);
if (div <= minDiv) {
if ((div < minDiv) || (bc.barOverLength > maxLength)) {
minDiv = div;
maxLength = bc.barOverLength;
bestBC = bc;
countBest = 1;
} else { //it is a tie, so return that not resolvable
bestBC = null;
countBest++;
}
}
}
return bestBC;
}
/**
* The barcode libraries used for this study can include two types of
* extraneous sequence at the end of reads. The first are chimeras created
* with the free ends. These will recreate the restriction site. The second
* are short regions (less than 64bp), so that will they will contain a
* portion of site and the universal adapter. This finds the first of site
* in likelyReadEnd, keeps the restriction site overhang and then sets
* everything to polyA afterwards
*
* @param seq An unprocessed tag sequence.
* @param maxLength The maximum number of bp in the processed sequence.
* @return returnValue A ReadBarcodeResult object containing the unprocessed
* tag, Cut site position, Processed tag, and Poly-A padded tag.
*/
public static ReadBarcodeResult removeSeqAfterSecondCutSite(String seq, byte maxLength) {
//this looks for a second restriction site or the common adapter start, and then turns the remaining sequence to AAAA
int cutSitePosition = 9999;
ReadBarcodeResult returnValue = new ReadBarcodeResult(seq);
//Look for cut sites, starting at a point past the length of the initial cut site remnant that all reads begin with
String match = null;
for (String potentialCutSite : likelyReadEnd) {
int p = seq.indexOf(potentialCutSite, 1);
if ((p > 1) && (p < cutSitePosition)) {
cutSitePosition = p;
match = potentialCutSite;
}
}
if (theEnzyme.equalsIgnoreCase("ApeKI") && cutSitePosition == 2
&& (match.equalsIgnoreCase("GCAGC") || match.equalsIgnoreCase("GCTGC"))) { // overlapping ApeKI cut site: GCWGCWGC
seq = seq.substring(3, seq.length()); // trim off the initial GCW from GCWGCWGC
cutSitePosition = 9999;
returnValue.unprocessedSequence = seq;
for (String potentialCutSite : likelyReadEnd) {
int p = seq.indexOf(potentialCutSite, 1);
if ((p > 1) && (p < cutSitePosition)) {
cutSitePosition = p;
}
}
}
if (cutSitePosition < maxLength) { // Cut site found
//Trim tag to sequence up to & including the cut site
returnValue.length = (byte) (cutSitePosition + readEndCutSiteRemnantLength);
returnValue.processedSequence = seq.substring(0, cutSitePosition + readEndCutSiteRemnantLength);
} else {
if (seq.length() <= 0) {
//If cut site is missing because there is no sequence
returnValue.processedSequence = "";
returnValue.length = 0;
} else {
//If cut site is missing because it is beyond the end of the sequence (or not present at all)
returnValue.length = (byte) Math.min(seq.length(), maxLength);
returnValue.processedSequence = (seq.substring(0, returnValue.length));
}
}
//Pad sequences shorter than max. length with A
if (returnValue.length < maxLength) {
returnValue.paddedSequence = returnValue.processedSequence + nullS;
returnValue.paddedSequence = returnValue.paddedSequence.substring(0, maxLength);
} else {
//Truncate sequences longer than max. length
returnValue.paddedSequence = returnValue.processedSequence.substring(0, maxLength);
returnValue.length = maxLength;
}
return returnValue;
}
/**
* Return a {@link ReadBarcodeResult} that captures the processed read and taxa
* inferred by the barcode
* @param seqS DNA sequence from the sequencer
* @param qualS quality score string from the sequencer
* @param fastq (fastq = true?; qseq=false?)
* @param minQual minimum quality score
* @return If barcode and cut site was found returns the result and
* processed sequence, if the barcode and cut site were not found return
* null
*/
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, boolean fastq, int minQual) {
long[] read = new long[2];
if ((minQual > 0) && (qualS != null)) {
int firstBadBase = BaseEncoder.getFirstLowQualityPos(qualS, minQual);
if (firstBadBase < (maxBarcodeLength + 2 * chunkSize)) {
return null;
}
}
int miss = -1;
if (fastq) {
miss = seqS.lastIndexOf('N', maxBarcodeLength + 2*chunkSize - 1);
} else {
miss = seqS.lastIndexOf('.', maxBarcodeLength + 2*chunkSize - 1);
}
if (miss != -1) {
return null; //bad sequence so skip
}
Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
if (bestBarcode == null) {
return null; //overhang missing so skip
}
String genomicSeq = seqS.substring(bestBarcode.barLength, seqS.length());
ReadBarcodeResult tagProcessingResults = removeSeqAfterSecondCutSite(genomicSeq, (byte) (2 * chunkSize));
String hap = tagProcessingResults.paddedSequence; //this is slow 20% of total time. Tag, cut site processed, padded with poly-A
read = BaseEncoder.getLongArrayFromSeq(hap);
int pos = tagProcessingResults.length;
//TODO this instantiation should also include the orginal unprocessedSequence, processedSequence, and paddedSequence - the the object encode it
ReadBarcodeResult rbr = new ReadBarcodeResult(read, (byte) pos, bestBarcode.getTaxaName(),bestBarcode.taxaIndex);
return rbr;
}
/**Returns the number of barcodes for the flowcell and lane*/
public int getBarCodeCount() {
return theBarcodes.length;
}
/**Returns the {@link Barcode} for the flowcell and lane*/
public Barcode getTheBarcodes(int index) {
return theBarcodes[index];
}
/**Returns the taxaNames for the flowcell and lane*/
public String[] getTaxaNames() {
String[] result = new String[getBarCodeCount()];
for (int i = 0; i < result.length; i++) {
result[i] = getTheBarcodes(i).getTaxaName();
}
return result;
}
}