/*
* ProductionSNPCallerPluginV2
*/
package net.maizegenetics.analysis.gbs.v2;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.LongAdder;
import javax.swing.ImageIcon;
import net.maizegenetics.analysis.gbs.Barcode;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.snp.Allele;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.SimpleAllele;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.BasicGenotypeMergeRule;
import net.maizegenetics.dna.snp.genotypecall.GenotypeMergeRule;
import net.maizegenetics.dna.tag.Tag;
import net.maizegenetics.dna.tag.TagBuilder;
import net.maizegenetics.dna.tag.TagData;
import net.maizegenetics.dna.tag.TagDataSQLite;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.log4j.Logger;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
/**
* This plugin converts all of the fastq (and/or qseq) files in the input folder
* that are listed in the key file to genotypes, and adds these to an output
* genotype file (VCF by default, or HDF5).
*
* We refer to this step as the "Production Pipeline".
*
* The output format is either HDF5 or VCF genotypes with allelic depth stored.
* The output file type is determined by the presence of a ".h5" suffix. SNP calling
* is quantitative, using either the Glaubitz/Buckler binomial method
* (pHet/pErr > 1 = het), which is the default, or the Stacks method
* (not yet implemented; see TODO below).
*
* Merging of samples with the same LibraryPrepID is handled by
* GenotypeTableBuilder.addTaxon(), with the genotypes re-called based upon the
* new depths. Therefore, if you want to keep adding genotypes to the same
* target HDF5 file in subsequent runs, use the -ko (keep open) option so that
* the output GenotypeTableBuilder will be mutable, using closeUnfinished()
* rather than build().
*
* If the target output is HDF5, and that GenotypeTable file doesn't exist, it will be
* created.
*
* Each taxon in the output file is named "ShortName:LibraryPrepID" and is
* annotated with "Flowcell_Lanes" (=source seq data for current genotype).
*
* Requires a database with variants added from a previous "Discovery Pipeline" run.
*
* References to "tag" are being replaced by references to "kmer" as the pipeline
* is really a kmer alignment process.
*
* TODO add the Stacks likelihood method to BasicGenotypeMergeRule
*
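* <p>Illustrative invocation (the file names here are hypothetical, not defaults):</p>
* <pre>{@code
* new ProductionSNPCallerPluginV2(null, false)
*         .inputDirectory("fastqDir")
*         .keyFile("myKey.txt")
*         .enzyme("ApeKI")
*         .inputGBSDatabase("myGBS.db")
*         .outputGenotypesFile("myGenotypes.vcf")
*         .performFunction(null);
* }</pre>
*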
* @author Ed Buckler
* @author Jeff Glaubitz
*/
public class ProductionSNPCallerPluginV2 extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(ProductionSNPCallerPluginV2.class);
private PluginParameter<String> myInputDir = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true).inDir()
.description("Input directory containing fastq AND/OR qseq files.").build();
private PluginParameter<String> myKeyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
.description("Key file listing barcodes distinguishing the samples").build();
private PluginParameter<String> myEnzyme = new PluginParameter.Builder<>("e", null, String.class).guiName("Enzyme").required(true)
.description("Enzyme used to create the GBS library").build();
private PluginParameter<String> myInputDB = new PluginParameter.Builder<>("db", null, String.class).guiName("Input GBS Database").required(true).inFile()
.description("Input Database file if using SQLite").build();
private PluginParameter<String> myOutputGenotypes = new PluginParameter.Builder<>("o", null, String.class).guiName("Output Genotypes File").required(true).outFile()
.description("Output (target) genotypes file to produce. Default output file type is VCF. If file suffix is .h5, an hdf5 file will be created instead.")
.build();
private PluginParameter<Double> myAveSeqErrorRate = new PluginParameter.Builder<>("eR", 0.01, Double.class).guiName("Ave Seq Error Rate")
.description("Average sequencing error rate per base (used to decide between heterozygous and homozygous calls)").build();
private PluginParameter<Integer> myMaxDivergence = new PluginParameter.Builder<>("d", 0, Integer.class).guiName("Max Divergence")
.description("Maximum divergence (edit distance) between new read and previously mapped read (Default: 0 = perfect matches only)").build();
private PluginParameter<Boolean> myKeepGenotypesOpen = new PluginParameter.Builder<>("ko", false, Boolean.class).guiName("Keep Genotypes Open")
.description("Only applicable to hdf5 output files: Keep hdf5 genotypes open for future runs that add more taxa or more depth").build();
private PluginParameter<Boolean> myDepthOutput = new PluginParameter.Builder<>("do", true, Boolean.class).guiName("Write Depths to Output")
.description("Depth output: True means write depths to the output hdf5 genotypes file, false means do NOT write depths to the hdf5 file").build();
private PluginParameter<Integer> myKmerLength = new PluginParameter.Builder<>("kmerLength", 64, Integer.class).guiName("Maximum Kmer Length")
.description("Length of kmers to process").build();
private PluginParameter<Double> posQualityScore = new PluginParameter.Builder<>("minPosQS", 0.0, Double.class).guiName("Minimum SNP quality score")
.description("Minimum quality score for a SNP position to be included").build();
private PluginParameter<Integer> myBatchSize = new PluginParameter.Builder<>("batchSize", 8, Integer.class).guiName("Batch size of fastq files").required(false)
.description("Number of flow cells being processed simultaneously").build();
private PluginParameter<Integer> myMinQualScore = new PluginParameter.Builder<>("mnQS", 0, Integer.class).guiName("Minimum quality score").required(false)
.description("Minimum quality score within the barcode and read length to be accepted").build();
//private PluginParameter myStacksLikelihood = new PluginParameter.Builder<>("sL", false, Boolean.class).guiName("Use Stacks Likelihood")
// .description("Use STACKS likelihood method to call heterozygotes (default: use tasselGBS likelihood ratio method)").build();
private String myOutputDir = null;
private static boolean isHDF5 = false; // default is VCF
private TagData tagDataReader = null;
Multimap<Taxon,Tag> tagCntMap = Multimaps.synchronizedMultimap(ArrayListMultimap.create(384, 500_000));
private Set<String> seqFilesInKeyAndDir = new TreeSet<>(); // fastq (or qseq) file names present in input directory that have a "Flowcell_Lane" in the key file
protected static int readEndCutSiteRemnantLength;
private Trie ahoCorasickTrie; // import from ahocorasick-0.2.1.jar
private String[] likelyReadEndStrings;
//Documentation of read depth per sample (one recorded per replicate)
// Treemap is synchronized as multiple threads may increment values.
private Map<String, Integer> rawReadCountsMap = new TreeMap<>();
private Map<String, Integer> rawReadCountsForFullSampleName = Collections.synchronizedMap(rawReadCountsMap);
private Map<String, Integer> matchedReadCountsMap = new TreeMap<>();
private Map<String, Integer> matchedReadCountsForFullSampleName = Collections.synchronizedMap(matchedReadCountsMap);
private GenotypeMergeRule genoMergeRule = null;
private boolean taglenException;
public ProductionSNPCallerPluginV2() {
super(null, false);
}
public ProductionSNPCallerPluginV2(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public void postProcessParameters() {
try {
myOutputDir = (new File(outputGenotypesFile())).getCanonicalFile().getParent();
} catch (IOException e) {
throw new IllegalStateException("Problem resolving output directory: " + e);
}
genoMergeRule = new BasicGenotypeMergeRule(aveSeqErrorRate());
if (!myOutputGenotypes.isEmpty()) {
if (outputGenotypesFile().endsWith(".h5")) {
isHDF5 = true;
}
}
if (!myEnzyme.isEmpty()) {
// Add likelyReadEnds to the ahoCorasick trie
EnzymeList.Enzyme enzyme = EnzymeList.defaultCache.getEnzyme(enzyme());
likelyReadEndStrings = enzyme.likelyReadEnd; // for removeSecondCutSiteIndexOf()
readEndCutSiteRemnantLength = enzyme.readEndCutSiteRemnantLength;
// // The junit test runs about a second faster (average 15.5 vs 16.5) without Trie().removeOverlaps();
// String[] likelyReadEnd = enzyme.likelyReadEnd();
// ahoCorasickTrie = new Trie(); // adding caseInsensitive causes nothing to be found
// for (String readEnd: likelyReadEnd) {
// ahoCorasickTrie.addKeyword(readEnd);
// }
//
// // Bug in the aho-corasick code that occurs when multiple threads
// // call parseText at once. The software computes the failure
// // states the first time "parseText" is called. If multiple threads
// // hit this at once, they will all call "constructFailureStates"
// // and this causes problems. Suggested workaround is to call
// // parseText once after all keywords are added, and before we enter
// // multi-threaded mode. This gets the failure states constructed
// // before we start processing the fastQ reads. See this link:
// // https://github.com/robert-bor/aho-corasick/issues/16
// ahoCorasickTrie.parseText("CAGCTTCAGGTGGTGCGGACGTGGTGGATCACGGCGCTGAAGCCCGCGCGCTTGGTCTGGCTGA");
}
}
@Override
public DataSet processData(DataSet input) {
int batchSize = batchSize();
Path keyPath = Paths.get(keyFile()).toAbsolutePath();
List<Path> directoryFiles = DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(myInputDir.value()).toAbsolutePath());
if (directoryFiles.isEmpty()) {
myLogger.warn("No files matching: " + GBSUtils.inputFileGlob);
return null;
}
List<Path> inputSeqFiles = GBSUtils.culledFiles(directoryFiles, keyPath);
if (inputSeqFiles.isEmpty()) return null; // no files to process
tagDataReader = new TagDataSQLite(myInputDB.value());
TaxaList masterTaxaList = TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField, new HashMap<>(), true);
writeInitialTaxaReadCounts(masterTaxaList); // initialize synchronized maps
//todo perhaps subset the masterTaxaList based on the files present, but it seems like it will all be figured out.
Map<Tag,Tag> canonicalTag = new HashMap<>(); // canonicalize them OR eventually we will use a Trie
tagDataReader.getTags().stream().forEach(t -> canonicalTag.put(t, t));
int batchNum = inputSeqFiles.size()/batchSize;
if (inputSeqFiles.size() % batchSize !=0) batchNum++;
System.out.println("ProductionSNPCallerPluginV2: Total batches to process: " + batchNum);
final PositionList positionList=tagDataReader.getSNPPositions(positionQualityScore());
if (positionList == null || positionList.size() == 0) {
String errMsg = "\nNo SNP positions found with quality score of " + positionQualityScore() + ".\n"
+ "Please run UpdateSNPPositionQualityPlugin to add quality scores for your positions,\n"
+ " then select SNP positions within the quality range you have specified.\n";
myLogger.error(errMsg);
return null;
}
GenotypeTableBuilder gtb=setUpGenotypeTableBuilder(outputGenotypesFile(), positionList, genoMergeRule);
final Multimap<Tag,AlleleWithPosIndex> tagsToIndex = ArrayListMultimap.create();
tagDataReader.getAlleleMap().entries().stream()
.forEach(e -> {
// indexOf returns -1 if the list doesn't contain the element, which it won't
// if there are snpposition entries with a quality score less than minimumQualityScore
int posIndex=positionList.indexOf(e.getValue().position());
if (posIndex >= 0) {
tagsToIndex.put(e.getKey(),new AlleleWithPosIndex(e.getValue(),posIndex));
}
});
taglenException = false;
for (int idx = 0; idx < inputSeqFiles.size(); idx+=batchSize) {
tagCntMap.clear(); // start fresh with each new batch
int end = idx+batchSize;
if (end > inputSeqFiles.size()) end = inputSeqFiles.size();
ArrayList<Path> sub = new ArrayList<>();
for (int jdx = idx; jdx < end; jdx++) sub.add(inputSeqFiles.get(jdx));
System.out.println("\nStart processing batch " + String.valueOf(idx/batchSize+1));
sub.parallelStream()
.forEach(inputSeqFile -> {
try {
processFastQFile(masterTaxaList,keyPath, inputSeqFile, enzyme(),canonicalTag,kmerLength(), minimumQualityScore());
} catch (StringIndexOutOfBoundsException oobe) {
oobe.printStackTrace();
myLogger.error(oobe.getMessage());
setTagLenException();
return;
}
});
if (taglenException) return null; // Tag length failure from processFastQ - halt processing
tagCntMap.asMap().entrySet().stream()
.forEach(e -> {
callGenotypes(e.getKey(), e.getValue(), tagsToIndex, positionList, genoMergeRule,gtb,depthToOutput());
//System.out.println(e.x.getName()+ Arrays.toString(Arrays.copyOfRange(e.y,0,10))));
});
System.out.println("\nFinished processing batch " + String.valueOf(idx/batchSize+1));
}
if (isHDF5) { // build hdf5 output
if (keepGenotypesOpen()) {
gtb.closeUnfinished();
} else {
gtb.build();
}
} else { // VCF output
// Build genotype
GenotypeTable myGt = gtb.build();
ExportUtils.writeToVCF(myGt, outputGenotypesFile(), depthToOutput());
}
writeReadsPerSampleReports(positionList.size());
return null;
}
private static void callGenotypes(Taxon taxon, Collection<Tag> tags, Multimap<Tag,AlleleWithPosIndex> tagsToIndex,
PositionList positionList, GenotypeMergeRule genoMergeRule, GenotypeTableBuilder gtb, boolean outputDepths) {
int[][] alleleDepths = new int[NucleotideAlignmentConstants.NUMBER_NUCLEOTIDE_ALLELES][positionList.numberOfSites()];
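// alleleDepths[allele][siteIndex] accumulates, for this taxon, the read depth of each
// nucleotide allele at each SNP position covered by the taxon's tags.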
tags.stream().map(t -> tagsToIndex.get(t)).flatMap(c -> c.stream())
.forEach(a -> alleleDepths[a.allele()][a.positionIndex()]++);
if (outputDepths) {
byte[][] byteDepths = AlleleDepthUtil.depthIntToByte(alleleDepths);
gtb.addTaxon(taxon, resolveGenosForTaxon(alleleDepths, genoMergeRule),byteDepths);
} else {
gtb.addTaxon(taxon, resolveGenosForTaxon(alleleDepths, genoMergeRule));
}
}
private class AlleleWithPosIndex extends SimpleAllele {
private int positionIndex;
private AlleleWithPosIndex(Allele myAllele, int positionIndex) {
super(myAllele.allele(), myAllele.position());
this.positionIndex=positionIndex;
}
public int positionIndex() {
return positionIndex;
}
}
private class CountOfReadQuality {
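// LongAdder lets the parallel per-file threads increment these counters without locking.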
LongAdder allReads=new LongAdder();
LongAdder goodBarcodedReads=new LongAdder();
LongAdder goodMatched=new LongAdder();
LongAdder perfectMatches=new LongAdder();
LongAdder imperfectMatches=new LongAdder();
LongAdder singleImperfectMatches=new LongAdder();
}
private void processFastQFile(TaxaList masterTaxaList, Path keyPath, Path fastQPath, String enzymeName,
Map<Tag,Tag> canonicalTags, int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException {
ArrayList<Taxon> tl = GBSUtils.getLaneAnnotatedTaxaList(keyPath, fastQPath);
BarcodeTrie barcodeTrie=GBSUtils.initializeBarcodeTrie(tl, masterTaxaList, EnzymeList.defaultCache.getEnzyme(enzymeName));
try {
processFastQ(fastQPath,barcodeTrie,canonicalTags,preferredTagLength, minQual);
} catch (StringIndexOutOfBoundsException oobe) {
throw oobe; // let processData() handle it
}
}
private void processFastQ(Path fastqFile, BarcodeTrie barcodeTrie, Map<Tag,Tag> canonicalTags,
int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException {
int allReads=0, goodBarcodedReads = 0, lowQualityReads = 0;
try {
int qualityScoreBase=GBSUtils.determineQualityScoreBase(fastqFile);
BufferedReader br = Utils.getBufferedReader(fastqFile.toString(), 1 << 22);
long time=System.nanoTime();
String[] seqAndQual;
while ((seqAndQual=GBSUtils.readFastQBlock(br, allReads)) != null) {
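// seqAndQual[0] is the sequence line and seqAndQual[1] the quality line of the fastq
// block (assumed from how GBSUtils.readFastQBlock's result is used below).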
allReads++;
// Decode barcode using the current sequence & quality score
Barcode barcode=barcodeTrie.longestPrefix(seqAndQual[0]);
if(barcode==null) continue;
if(minQual>0) {
//todo move getFirstLowQualityPos into this class?
if(BaseEncoder.getFirstLowQualityPos(seqAndQual[1],minQual, qualityScoreBase)<(barcode.getBarLength()+preferredTagLength)){
lowQualityReads++;
continue;
}
}
rawReadCountsForFullSampleName.merge(barcode.getTaxaName(), 1, Integer::sum); // atomic increment via the synchronized map
int barcodeLen = barcode.getBarLength();
if (seqAndQual[0].length() - barcodeLen < preferredTagLength) {
String errMsg = "\n\nERROR processing " + fastqFile.toString() + "\n" +
"Reading entry number " + allReads + " fails the length test.\n" +
"Sequence length " + seqAndQual[0].length() + " minus barcode length "+ barcodeLen +
" is less than kmerLength " + preferredTagLength + ".\n" +
"Re-run your files with either a shorter kmerLength value or a higher minimum quality score.\n";
throw new StringIndexOutOfBoundsException(errMsg);
}
Tag tag = removeSecondCutSiteIndexOf(seqAndQual[0].substring(barcodeLen),preferredTagLength);
//Tag tag = removeSecondCutSiteAhoC(seqAndQual[0].substring(barcodeLen),preferredTagLength);
//Tag tag= TagBuilder.instance(seqAndQual[0].substring(barcode.getBarLength(), barcode.getBarLength() + preferredTagLength)).build();
if(tag==null) continue; //null occurs when any base was not A, C, G, T
goodBarcodedReads++;
Tag canonicalTag=canonicalTags.get(tag);
if(canonicalTag!=null) {
tagCntMap.put(barcode.getTaxon(),canonicalTag);
matchedReadCountsForFullSampleName.merge(barcode.getTaxaName(), 1, Integer::sum); // atomic increment via the synchronized map
}
if (allReads % 1000000 == 0) {
myLogger.info("Total Reads:" + allReads + " Reads with barcode and cut site overhang:" + goodBarcodedReads
+ " rate:" + (System.nanoTime()-time)/allReads +" ns/read");
}
}
myLogger.info("Total number of reads in lane=" + allReads);
myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
myLogger.info("Total number of low quality reads=" + lowQualityReads);
myLogger.info("Timing process (sorting, collapsing, and writing TagCount to file).");
myLogger.info("Process took " + (System.nanoTime() - time)/1e6 + " milliseconds for file " + fastqFile.toString());
br.close();
} catch (Exception e) {
myLogger.error("Good Barcodes Read: " + goodBarcodedReads);
e.printStackTrace();
}
}
// Using indexOf() is much faster than using Aho-C trie to find
// the second cut site. (junits ran on average of 15.81281 seconds
// for aho-c after 5 runs, vs 13.53204 for 5 runs using indexOf method)
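// Hypothetical walk-through: suppose likelyReadEndStrings = {"GCAGC", "GCTGC"} (ApeKI-style
// read ends) and the earliest match starts 15 bases into shortSeq. Since shortSeq skips the
// first 20 bases (a second cut site cannot occur within the minimum tag length), the tag is
// trimmed to seq.substring(0, 15 + 20 + readEndCutSiteRemnantLength); if no match falls
// within preferredLength, the read is simply truncated to preferredLength.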
private Tag removeSecondCutSiteIndexOf(String seq, int preferredLength) {
Tag tag = null;
// handle overlapping cutsite for ApeKI enzyme
if (enzyme().equalsIgnoreCase("ApeKI")) {
if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
seq = seq.substring(3,seq.length());
}
}
int indexOfReadEnd = -1;
String shortSeq = seq.substring(20);
for (String readEnd: likelyReadEndStrings){
int indx = shortSeq.indexOf(readEnd);
if (indx > 0 ) {
if (indexOfReadEnd < 0 || indx < indexOfReadEnd) {
indexOfReadEnd = indx;
}
}
}
if (indexOfReadEnd > 0 &&
(indexOfReadEnd + 20 + readEndCutSiteRemnantLength < preferredLength)) {
// trim tag to sequence up to & including the cut site
tag = TagBuilder.instance(seq.substring(0, indexOfReadEnd + 20 + readEndCutSiteRemnantLength)).build();
} else {
int seqEnd = Math.min(seq.length(), preferredLength);
return TagBuilder.instance(seq.substring(0,seqEnd)).build();
}
return tag;
}
private Tag removeSecondCutSiteAhoC(String seq, int preferredLength) {
// Removes the second cut site BEFORE we trim the tag.
// This preserves the cut site in case it shows up in the middle.
Tag tag = null;
int cutSiteIndex = 9999;
// handle overlapping cutsite for ApeKI enzyme
if (enzyme().equalsIgnoreCase("ApeKI")) {
if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
seq = seq.substring(3,seq.length());
}
}
// use ahoCorasickTrie to find cut site
// Start at seq+20 since 20 is default minimum length
Collection<Emit> emits = null;
try {
emits = ahoCorasickTrie.parseText(seq.substring(20));
} catch (Exception emitsEx) {
System.out.println("ahoCorasick excep: seq: " + seq);
emitsEx.printStackTrace();
// proceed as if no tag was found. need to understand why sometimes
// aho-c complains of a null pointer
int seqEnd = Math.min(seq.length(), preferredLength);
return TagBuilder.instance(seq.substring(0,seqEnd)).build();
}
// When more than one cut site is found, the emits appear in the order they occur
// in the sequence, so we only need to look at the first one.
for (Emit emit: emits) {
cutSiteIndex = emit.getStart();
break;
}
if (cutSiteIndex + 20 + readEndCutSiteRemnantLength < preferredLength) { // add 20 as we cut off first 20 for aho-c parsing
// trim tag to sequence up to & including the cut site
tag = TagBuilder.instance(seq.substring(0, cutSiteIndex + 20 + readEndCutSiteRemnantLength)).build();
} else {
// if no cut site found, or it is beyond preferred length
int seqEnd = Math.min(seq.length(), preferredLength);
tag = TagBuilder.instance(seq.substring(0,seqEnd)).build();
}
return tag;
}
private void reportProgress(int[] counters, long readSeqReadTime, long ifRRNotNullTime) {
myLogger.info(
"totalReads:" + counters[0]
+ " goodBarcodedReads:" + counters[1]
+ " goodMatchedToTOPM:" + counters[2]
// + " perfectMatches:" + counters[3]
// + " nearMatches:" + counters[4]
// + " uniqueNearMatches:" + counters[5]
+ " cumulReadSequenceTime: " + ((double) (readSeqReadTime) / 1_000_000_000.0) + " sec"
+ " cumulProcessSequenceTime: " + ((double) (ifRRNotNullTime) / 1_000_000_000.0) + " sec"
);
}
private void reportTotals(Path fileName, int[] counters, int nFilesProcessed) {
myLogger.info("Total number of reads in lane=" + counters[0]);
myLogger.info("Total number of good, barcoded reads=" + counters[1]);
myLogger.info("Total number of good, barcoded reads matched to the TOPM=" + counters[2]);
myLogger.info("Finished reading " + nFilesProcessed + " of " + seqFilesInKeyAndDir.size() + " sequence files: " + fileName + "\n");
}
private static GenotypeTableBuilder setUpGenotypeTableBuilder(String anOutputFile, PositionList positionList, GenotypeMergeRule mergeRule) {
if (isHDF5) {
File hdf5File = new File(anOutputFile);
if (hdf5File.exists()) {
myLogger.info("\nGenotypes will be added to existing HDF5 file:\n " + anOutputFile + "\n");
return GenotypeTableBuilder.mergeTaxaIncremental(anOutputFile, mergeRule);
} else {
myLogger.info("\nThe target HDF5 file:\n " + anOutputFile
+ "\ndoes not exist. A new HDF5 file of that name will be created \nto hold the genotypes from this run.");
return GenotypeTableBuilder.getTaxaIncrementalWithMerging(anOutputFile, positionList, mergeRule);
}
} else { // create genotype table for VCF
GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList, mergeRule);
myLogger.info("\nOutput VCF file: \n" + anOutputFile +
" \ncreated for genotypes from this run.");
return gtb;
}
}
private static byte[] resolveGenosForTaxon(int[][] depthsForTaxon, GenotypeMergeRule genoMergeRule) {
int nAlleles = depthsForTaxon.length;
int[] depthsAtSite = new int[nAlleles];
int nSites = depthsForTaxon[0].length;
byte[] genos = new byte[nSites];
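// Transpose one site at a time from the allele-major depth matrix, then let the merge
// rule call the genotype for that site from its per-allele depths.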
for (int site = 0; site < nSites; site++) {
for (int allele = 0; allele < nAlleles; allele++) {
depthsAtSite[allele] = depthsForTaxon[allele][site];
}
genos[site] = genoMergeRule.callBasedOnDepth(depthsAtSite);
}
return genos;
}
private void writeReadsPerSampleReports(int snpsProcessed) {
myLogger.info("\nWriting ReadsPerSample log file...");
String outFileS = myOutputDir + File.separator + (new File(keyFile())).getName();
outFileS = outFileS.replace(".txt", "_ReadsPerSample.log");
outFileS = outFileS.replace("_key", "");
try {
String msg = "ReadsPerSample log file: " + outFileS;
myLogger.info(msg);
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outFileS))), 65536);
bw.write("FullSampleName\t\t\tgoodBarcodedReads\tgoodReadsMatchedToDataBase\n");
for (String fullSampleName : rawReadCountsForFullSampleName.keySet()) {
bw.write(fullSampleName + "\t" + rawReadCountsForFullSampleName.get(fullSampleName) + "\t\t" + matchedReadCountsForFullSampleName.get(fullSampleName) + "\n");
}
bw.close();
} catch (Exception e) {
myLogger.error("Couldn't write to ReadsPerSample log file: " + e);
e.printStackTrace();
System.exit(1);
}
myLogger.info("\n\nTotal number of SNPs processed with minimum quality score " + minimumQualityScore() + " was " + tagsProcessed + ".\n");
myLogger.info(" ...done\n");
}
private void writeInitialTaxaReadCounts(TaxaList tl) {
tl.stream() // Add initial taxa names with count of 0 to synchronized maps
.forEach(taxon -> {
rawReadCountsForFullSampleName.put(taxon.getName(), 0);
matchedReadCountsForFullSampleName.put(taxon.getName(), 0);
});
}
public void setTagLenException() {
taglenException = true;
}
private void printFileNameConventions(String actualFileName) {
String message
= "\n\n"
+ "Error in parsing file name:"
+ "\n The raw sequence filename does not contain either 3, 4, or 5 underscore-delimited values."
+ "\n Acceptable file naming conventions include the following (where FLOWCELL indicates the flowcell name and LANE is an integer):"
+ "\n FLOWCELL_LANE_fastq.gz"
+ "\n FLOWCELL_s_LANE_fastq.gz"
+ "\n code_FLOWCELL_s_LANE_fastq.gz"
+ "\n FLOWCELL_LANE_fastq.txt.gz"
+ "\n FLOWCELL_s_LANE_fastq.txt.gz"
+ "\n code_FLOWCELL_s_LANE_fastq.txt.gz"
+ "\n FLOWCELL_LANE_qseq.txt.gz"
+ "\n FLOWCELL_s_LANE_qseq.txt.gz"
+ "\n code_FLOWCELL_s_LANE_qseq.txt.gz"
+ "\n"
+ "\n Actual Filename: " + actualFileName
+ "\n\n";
myLogger.error(message);
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Production SNP Caller";
}
@Override
public String getToolTipText() {
return "Production SNP Caller";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(ProductionSNPCallerPluginV2.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
public TagData runPlugin(DataSet input) {
return (TagData) performFunction(input).getData(0).getData();
}
/**
* Input directory containing fastq AND/OR qseq files.
*
* @return Input Directory
*/
public String inputDirectory() {
return myInputDir.value();
}
/**
* Set Input Directory. Input directory containing fastq
* AND/OR qseq files.
*
* @param value Input Directory
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 inputDirectory(String value) {
myInputDir = new PluginParameter<>(myInputDir, value);
return this;
}
/**
* Key file listing barcodes distinguishing the samples
*
* @return Key File
*/
public String keyFile() {
return myKeyFile.value();
}
/**
* Set Key File. Key file listing barcodes distinguishing
* the samples
*
* @param value Key File
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 keyFile(String value) {
myKeyFile = new PluginParameter<>(myKeyFile, value);
return this;
}
/**
* Enzyme used to create the GBS library
*
* @return Enzyme
*/
public String enzyme() {
return myEnzyme.value();
}
/**
* Set Enzyme. Enzyme used to create the GBS library
*
* @param value Enzyme
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 enzyme(String value) {
myEnzyme = new PluginParameter<>(myEnzyme, value);
return this;
}
/**
* Input Database file if using SQLite
*
* @return Input GBS Database
*/
public String inputGBSDatabase() {
return myInputDB.value();
}
/**
* Set Input GBS Database. Input Database file if using
* SQLite
*
* @param value Input GBS Database
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 inputGBSDatabase(String value) {
myInputDB = new PluginParameter<>(myInputDB, value);
return this;
}
/**
* Output (target) genotypes file to add new genotypes
* to (new file created if it doesn't exist)
*
* @return Output file Genotypes File
*/
public String outputGenotypesFile() {
return myOutputGenotypes.value();
}
/**
* Set Output Genotypes File. Output (target)
* genotypes file to add new genotypes to (new file created
* if it doesn't exist)
*
* @param value Output Genotypes File
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 outputGenotypesFile(String value) {
myOutputGenotypes = new PluginParameter<>(myOutputGenotypes, value);
return this;
}
/**
* Average sequencing error rate per base (used to decide
* between heterozygous and homozygous calls)
*
* @return Ave Seq Error Rate
*/
public Double aveSeqErrorRate() {
return myAveSeqErrorRate.value();
}
/**
* Set Ave Seq Error Rate. Average sequencing error rate
* per base (used to decide between heterozygous and homozygous
* calls)
*
* @param value Ave Seq Error Rate
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 aveSeqErrorRate(Double value) {
myAveSeqErrorRate = new PluginParameter<>(myAveSeqErrorRate, value);
return this;
}
/**
* Maximum divergence (edit distance) between new read
* and previously mapped read (Default: 0 = perfect matches
* only)
*
* @return Max Divergence
*/
public Integer maxDivergence() {
return myMaxDivergence.value();
}
/**
* Set Max Divergence. Maximum divergence (edit distance)
* between new read and previously mapped read (Default:
* 0 = perfect matches only)
*
* @param value Max Divergence
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 maxDivergence(Integer value) {
myMaxDivergence = new PluginParameter<>(myMaxDivergence, value);
return this;
}
/**
* Keep hdf5 genotypes open for future runs that add more
* taxa or more depth
*
* @return Keep Genotypes Open
*/
public Boolean keepGenotypesOpen() {
return myKeepGenotypesOpen.value();
}
/**
* Set Keep Genotypes Open. Keep hdf5 genotypes open for
* future runs that add more taxa or more depth
*
* @param value Keep Genotypes Open
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 keepGenotypesOpen(Boolean value) {
myKeepGenotypesOpen = new PluginParameter<>(myKeepGenotypesOpen, value);
return this;
}
/**
* Output depth: write depths to the output
* hdf5 genotypes file
*
* @return Depth to Output - true or false
*/
public Boolean depthToOutput() {
return myDepthOutput.value();
}
/**
* User sets true or false, indicating if they do
* or do not want depth information written to the
* HDF5 file.
*
* @param value Write depth to output file
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 depthToOutput(Boolean value) {
myDepthOutput = new PluginParameter<>(myDepthOutput, value);
return this;
}
/**
* Maximum Tag Length
*
* @return Maximum Tag Length
*/
public Integer kmerLength() {
return myKmerLength.value();
}
/**
* Set Maximum Tag Length: User should set this value
* equivalent to what was used in GBSSeqToTagDBPlugin
* for maximum tag length when creating the database.
* If the two values are not equal inconsistent results
* may occur.
*
* @param value Maximum Tag Length
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 kmerLength(Integer value) {
myKmerLength = new PluginParameter<>(myKmerLength, value);
return this;
}
/**
* Minimum Position Quality Score
*
* @return Minimum position quality score
*/
public Double positionQualityScore() {
return posQualityScore.value();
}
/**
* Set Minimum quality score for position: This value is used to pull
* SNPs out of the snpposition table. Only snps with quality
* scores meeting or exceeding the specified value will be
* processed.
*
* @param value Minimum position quality score
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 positionQualityScore(Double value) {
posQualityScore = new PluginParameter<>(posQualityScore, value);
return this;
}
/**
* Batch size for processing fastq files
*
* @return batchSize
*/
public Integer batchSize() {
return myBatchSize.value();
}
/**
* Set Batch Size. Number of fastq files processed
* simultaneously
*
* @param value Batch Size
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 batchSize(Integer value) {
myBatchSize = new PluginParameter<>(myBatchSize, value);
return this;
}
/**
* Minimum quality score within the barcode and read length
* to be accepted
*
* @return Minimum quality score
*/
public Integer minimumQualityScore() {
return myMinQualScore.value();
}
/**
* Set Minimum quality score. Minimum quality score within
* the barcode and read length to be accepted
*
* @param value Minimum quality score
*
* @return this plugin
*/
public ProductionSNPCallerPluginV2 minimumQualityScore(Integer value) {
myMinQualScore = new PluginParameter<>(myMinQualScore, value);
return this;
}
/**
* Use STACKS likelihood method to call heterozygotes (default: use
* tasselGBS likelihood ratio method)
*
* @return Use Stacks Likelihood
*/
//public Boolean useStacksLikelihood() {
// return myStacksLikelihood.value();
//}
/**
* Set Use Stacks Likelihood. Use STACKS likelihood method to call
* heterozygotes (default: use tasselGBS likelihood ratio method)
*
* @param value Use Stacks Likelihood
*
* @return this plugin
*/
//public ProductionSNPCallerPlugin useStacksLikelihood(Boolean value) {
// myStacksLikelihood = new PluginParameter<>(myStacksLikelihood, value);
// return this;
//}
}