/*
 * ProductionSNPCallerPlugin
 */
package net.maizegenetics.analysis.gbs.v2;

import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.LongAdder;

import javax.swing.ImageIcon;

import net.maizegenetics.analysis.gbs.Barcode;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.snp.Allele;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.SimpleAllele;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.BasicGenotypeMergeRule;
import net.maizegenetics.dna.snp.genotypecall.GenotypeMergeRule;
import net.maizegenetics.dna.tag.Tag;
import net.maizegenetics.dna.tag.TagBuilder;
import net.maizegenetics.dna.tag.TagData;
import net.maizegenetics.dna.tag.TagDataSQLite;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;

import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;

/**
 * This plugin converts all of the fastq (and/or qseq) files in the input folder
 * that are listed in the key file to genotypes, and adds these to a genotype
 * file in HDF5 or VCF format.
 *
 * We refer to this step as the "Production Pipeline".
 *
 * The output format is either HDF5 or VCF genotypes with allelic depth stored.
 * Output file type is determined by the presence of the ".h5" suffix.  SNP calling
 * is quantitative, using the Glaubitz/Buckler binomial method
 * (pHet/pErr > 1 = het), which is the default; a Stacks likelihood method is
 * planned but not yet implemented (see the TODO below).
 *
 * Merging of samples with the same LibraryPrepID is handled by
 * GenotypeTableBuilder.addTaxon(), with the genotypes re-called based upon the
 * new depths. Therefore, if you want to keep adding genotypes to the same
 * target HDF5 file in subsequent runs, use the -ko (keep open) option so that
 * the output GenotypeTableBuilder will be mutable, using closeUnfinished()
 * rather than build().
 *
 * If the target output is HDF5, and that GenotypeTable file doesn't exist, it will be
 * created.  
 *
 * Each taxon in the output file is named "ShortName:LibraryPrepID" and is
 * annotated with "Flowcell_Lanes" (=source seq data for current genotype).
 *
 * Requires a database with variants added from a previous "Discovery Pipeline" run.
 * 
 * References to "tag" are being replaced by references to "kmer" as the pipeline
 * is really a kmer alignment process.
 *
 * TODO add the Stacks likelihood method to BasicGenotypeMergeRule
 *
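 * A typical command-line invocation via the TASSEL pipeline looks like the
 * following (a sketch only; file and database names are illustrative):
 * <pre>
 * ./run_pipeline.pl -fork1 -ProductionSNPCallerPluginV2
 *     -db GBSv2.db -e ApeKI -i myFastqDir -k myKeyFile.txt
 *     -kmerLength 64 -o myGenotypes.vcf -endPlugin -runfork1
 * </pre>
 *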
 * @author Ed Buckler
 * @author Jeff Glaubitz
 */
public class ProductionSNPCallerPluginV2 extends AbstractPlugin {

    private static final Logger myLogger = LogManager.getLogger(ProductionSNPCallerPluginV2.class);

    private PluginParameter<String> myInputDir = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true).inDir()
            .description("Input directory containing fastq AND/OR qseq files.").build();
    private PluginParameter<String> myKeyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
            .description("Key file listing barcodes distinguishing the samples").build();
    private PluginParameter<String> myEnzyme = new PluginParameter.Builder<>("e", null, String.class).guiName("Enzyme").required(true)
            .description("Enzyme used to create the GBS library").build();
    private PluginParameter<String> myInputDB = new PluginParameter.Builder<>("db", null, String.class).guiName("Input GBS Database").required(true).inFile()
            .description("Input Database file if using SQLite").build();
    private PluginParameter<String> myOutputGenotypes = new PluginParameter.Builder<>("o", null, String.class).guiName("Output Genotypes File").required(true).outFile()
            .description("Output (target) genotypes file to produce.  Default output file type is VCF.  If file suffix is .h5, an hdf5 file will be created instead.")
            .build();
    private PluginParameter<Double> myAveSeqErrorRate = new PluginParameter.Builder<>("eR", 0.01, Double.class).guiName("Ave Seq Error Rate")
            .description("Average sequencing error rate per base (used to decide between heterozygous and homozygous calls)").build();
    private PluginParameter<Integer> myMaxDivergence = new PluginParameter.Builder<>("d", 0, Integer.class).guiName("Max Divergence")
            .description("Maximum divergence (edit distance) between new read and previously mapped read (Default: 0 = perfect matches only)").build();
    private PluginParameter<Boolean> myKeepGenotypesOpen = new PluginParameter.Builder<>("ko", false, Boolean.class).guiName("Keep Genotypes Open")
            .description("Only applicable to hdf5 output files: Keep hdf5 genotypes open for future runs that add more taxa or more depth").build();
    private PluginParameter<Boolean> myDepthOutput = new PluginParameter.Builder<>("do", true, Boolean.class).guiName("Write Depths to Output")
            .description("Depth output: true means write depths to the output hdf5 genotypes file; false means do NOT write depths to the hdf5 file").build();
    private PluginParameter<Integer> myKmerLength = new PluginParameter.Builder<>("kmerLength", 64, Integer.class).guiName("Maximum Kmer Length")
            .description("Length of kmers to process").build();
    private PluginParameter<Double> posQualityScore = new PluginParameter.Builder<>("minPosQS", 0.0, Double.class).guiName("Minimum SNP quality score")
            .description("Minimum quality score for a SNP position to be included").build();
    private PluginParameter<Integer> myBatchSize = new PluginParameter.Builder<>("batchSize", 8, Integer.class).guiName("Batch size of fastq files").required(false)
            .description("Number of flow cells being processed simultaneously").build();
    private PluginParameter<Integer> myMinQualScore = new PluginParameter.Builder<>("mnQS", 0, Integer.class).guiName("Minimum quality score").required(false)
            .description("Minimum quality score within the barcode and read length to be accepted").build();
    //private PluginParameter myStacksLikelihood = new PluginParameter.Builder<>("sL", false, Boolean.class).guiName("Use Stacks Likelihood")
    //        .description("Use STACKS likelihood method to call heterozygotes (default: use tasselGBS likelihood ratio method)").build();

    private String myOutputDir = null;
    private static boolean isHDF5 = false; // default is VCF
    private TagData tagDataReader = null;
    Multimap<Taxon, Tag> tagCntMap = Multimaps.synchronizedMultimap(ArrayListMultimap.create(384, 500_000));
    private Set<String> seqFilesInKeyAndDir = new TreeSet<>(); // fastq (or qseq) file names present in input directory that have a "Flowcell_Lane" in the key file
 
    protected static int readEndCutSiteRemnantLength;
    private Trie ahoCorasickTrie; // import from ahocorasick-0.2.1.jar
    private String[] likelyReadEndStrings;

    //Documentation of read depth per sample (one recorded per replicate)
    // Treemap is synchronized as multiple threads may increment values.
    private Map<String, Integer> rawReadCountsMap = new TreeMap<>();
    private Map<String, Integer> rawReadCountsForFullSampleName = Collections.synchronizedMap(rawReadCountsMap);
    private Map<String, Integer> matchedReadCountsMap = new TreeMap<>();
    private Map<String, Integer> matchedReadCountsForFullSampleName = Collections.synchronizedMap(matchedReadCountsMap);

    private GenotypeMergeRule genoMergeRule = null;
    private boolean taglenException;
    
    public ProductionSNPCallerPluginV2() {
        super(null, false);
    }

    public ProductionSNPCallerPluginV2(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    public void postProcessParameters() {
        try {
            myOutputDir = (new File(outputGenotypesFile())).getCanonicalFile().getParent();
        } catch (IOException e) {
            throw new IllegalStateException("Problem resolving output directory:" + e);
        }
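        // BasicGenotypeMergeRule implements the Glaubitz/Buckler binomial
        // likelihood-ratio approach described in the class javadoc (a het is
        // called when pHet/pErr > 1), parameterized by the average per-base
        // sequencing error rate.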
        genoMergeRule = new BasicGenotypeMergeRule(aveSeqErrorRate());
        
        if (!myOutputGenotypes.isEmpty()) {
            if (outputGenotypesFile().endsWith(".h5")) {
                isHDF5 = true;
            }
        }
        if (!myEnzyme.isEmpty()) {
            // Look up the enzyme's likely read-end strings and cut-site remnant length.
            // (The Aho-Corasick trie construction below is retained for reference but is currently disabled.)
            EnzymeList.Enzyme enzyme = EnzymeList.defaultCache.getEnzyme(enzyme());
 
            likelyReadEndStrings = enzyme.likelyReadEnd; // for removeSecondCutSiteIndexOf()
            readEndCutSiteRemnantLength = enzyme.readEndCutSiteRemnantLength;
//            // the junit test runs about a second faster (average 15.5 vs 16.5) without Trie().removeOverlaps();
//            String[] likelyReadEnd = enzyme.likelyReadEnd();
//            ahoCorasickTrie = new Trie(); // adding caseInsensitive causes nothing to be found
//            for (String readEnd: likelyReadEnd) {
//                ahoCorasickTrie.addKeyword(readEnd);
//            }  
//
//            // Bug in aho-corasick code that occurs when multiple threads
//            // call parseText at once.  The software computes the failure
//            // states the first time "parseText" is called.  If multiple threads
//            // hit this at once, they will all call "constructFailureStates" 
//            // and this causes problems.  Suggested workaround is to call
//            // parseText once after all keywords are added, and before we enter
//            // multi-threaded mode.  This gets the failure states constructed
//            // before we start processing the fastQ reads.  See this link:
//            //   https://github.com/robert-bor/aho-corasick/issues/16
//            ahoCorasickTrie.parseText("CAGCTTCAGGTGGTGCGGACGTGGTGGATCACGGCGCTGAAGCCCGCGCGCTTGGTCTGGCTGA");
        }
    }

    @Override
    public DataSet processData(DataSet input) {
        int batchSize = batchSize();
        Path keyPath= Paths.get(keyFile()).toAbsolutePath();
        List<Path> directoryFiles = DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(myInputDir.value()).toAbsolutePath());
        if(directoryFiles.isEmpty()) {
            myLogger.warn("No files matching:"+GBSUtils.inputFileGlob);
            return null;
        }
        List<Path> inputSeqFiles = GBSUtils.culledFiles(directoryFiles, keyPath);
        if (inputSeqFiles.isEmpty()) return null; // no files to process

        tagDataReader =new TagDataSQLite(myInputDB.value());
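        // The key file is a tab-delimited table (typically with Flowcell, Lane,
        // Barcode, FullSampleName, and LibraryPrepID columns); taxa are built
        // from the sample-name field, with lane information kept as annotations.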
        TaxaList masterTaxaList= TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField, new HashMap<>(), true);
        writeInitialTaxaReadCounts(masterTaxaList); // initialize synchronized maps
        //todo perhaps subset the masterTaxaList based on the files in there, but it seems like it will all be figured out.
        Map<Tag, Tag> canonicalTag = new HashMap<>();  // canonicalize them OR eventually we will use a Trie
        tagDataReader.getTags().stream().forEach(t -> canonicalTag.put(t,t));
        int batchNum = inputSeqFiles.size()/batchSize;
       
        if (inputSeqFiles.size() % batchSize !=0) batchNum++;
        System.out.println("ProductionSNPCallerPluginV2: Total batches to process: " + batchNum);

        final PositionList positionList=tagDataReader.getSNPPositions(positionQualityScore());
        if (positionList == null || positionList.size() == 0) {
        	String errMsg = "\nNo snp positons found with quality score of " + positionQualityScore() + ".\n"
        			+ "Please run UpdateSNPPositionQualityPlugin to add quality scores for your positions,\n"
        			+ " then select snp positions within a quality range you have specified.\n";
        	myLogger.error(errMsg);
        	return null;
        }
               
        GenotypeTableBuilder gtb=setUpGenotypeTableBuilder(outputGenotypesFile(), positionList, genoMergeRule);
        final Multimap<Tag, AlleleWithPosIndex> tagsToIndex = ArrayListMultimap.create();
        tagDataReader.getAlleleMap().entries().stream()
                .forEach(e -> {
                    // indexOf returns -1 if the list doesn't contain the element, which it won't
                    // if there are snpposition entries with a quality score below the minimum position quality score
                    int posIndex = positionList.indexOf(e.getValue().position());
                    if (posIndex >= 0) {
                        tagsToIndex.put(e.getKey(), new AlleleWithPosIndex(e.getValue(), posIndex));
                    }
                });
        
        taglenException = false;
        for (int idx = 0; idx < inputSeqFiles.size(); idx+=batchSize) {
            tagCntMap.clear(); // start fresh with each new batch
            int end = idx+batchSize;
            if (end > inputSeqFiles.size()) end = inputSeqFiles.size();
            ArrayList<Path> sub = new ArrayList<>();
            for (int jdx = idx; jdx < end; jdx++) sub.add(inputSeqFiles.get(jdx));
            System.out.println("\nStart processing batch " + String.valueOf(idx/batchSize+1));
            sub.parallelStream()
            .forEach(inputSeqFile -> {
                try {
                    processFastQFile(masterTaxaList,keyPath, inputSeqFile, enzyme(),canonicalTag,kmerLength(), minimumQualityScore());
                } catch (StringIndexOutOfBoundsException oobe) {
                    oobe.printStackTrace();
                    myLogger.error(oobe.getMessage());
                    setTagLenException();
                    return;
                }              
            });
            if (taglenException) return null; // Tag length failure from processFastQ - halt processing
         
            tagCntMap.asMap().entrySet().stream()
            .forEach(e -> {
                callGenotypes(e.getKey(), e.getValue(), tagsToIndex, positionList, genoMergeRule,gtb,depthToOutput());
                //System.out.println(e.x.getName()+ Arrays.toString(Arrays.copyOfRange(e.y,0,10)))); 
            });
            System.out.println("\nFinished processing batch " + String.valueOf(idx/batchSize+1));
        }
 
        if (isHDF5) { // build hdf5 output
            if (keepGenotypesOpen()) {
                gtb.closeUnfinished();
            } else {
                gtb.build();
            }
        } else { // VCF output
            // Build genotype
            GenotypeTable myGt = gtb.build();
            ExportUtils.writeToVCF(myGt, outputGenotypesFile(), depthToOutput()); 
        }
 
        writeReadsPerSampleReports(positionList.size());
        return null;
    }

    private static void callGenotypes(Taxon taxon, Collection<Tag> tags, Multimap<Tag, AlleleWithPosIndex> tagsToIndex,
                   PositionList positionList, GenotypeMergeRule genoMergeRule, GenotypeTableBuilder gtb, boolean outputDepths) {
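        // Accumulate allelic read depths for this taxon: each observed tag maps
        // to zero or more alleles (each carrying an index into positionList),
        // and every occurrence of a tag increments the depth of its allele(s)
        // at the corresponding site.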
        int[][] alleleDepths = new int[NucleotideAlignmentConstants.NUMBER_NUCLEOTIDE_ALLELES][positionList.numberOfSites()];
        tags.stream().map(t -> tagsToIndex.get(t)).flatMap(c -> c.stream())
                .forEach(a -> alleleDepths[a.allele()][a.positionIndex()]++);
        if (outputDepths) {
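            // AlleleDepthUtil packs the int depths into a compact byte encoding
            // (large counts are compressed, approximately logarithmically) for
            // storage alongside the genotypes.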
            byte[][] byteDepths = AlleleDepthUtil.depthIntToByte(alleleDepths);
            gtb.addTaxon(taxon, resolveGenosForTaxon(alleleDepths, genoMergeRule),byteDepths);
        } else {
            gtb.addTaxon(taxon, resolveGenosForTaxon(alleleDepths, genoMergeRule));
        }
    }

    private class AlleleWithPosIndex extends SimpleAllele {
        private int positionIndex;

        private AlleleWithPosIndex(Allele myAllele, int positionIndex) {
            super(myAllele.allele(), myAllele.position());
            this.positionIndex=positionIndex;
        }

        public int positionIndex() {
            return positionIndex;
        }
    }

    private class CountOfReadQuality {
        LongAdder allReads=new LongAdder();
        LongAdder goodBarcodedReads=new LongAdder();
        LongAdder goodMatched=new LongAdder();
        LongAdder perfectMatches=new LongAdder();
        LongAdder imperfectMatches=new LongAdder();
        LongAdder singleImperfectMatches=new LongAdder();
    }

    private void processFastQFile(TaxaList masterTaxaList, Path keyPath, Path fastQPath, String enzymeName,
            Map<Tag, Tag> canonicalTags, int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException {
        ArrayList<Taxon> tl = GBSUtils.getLaneAnnotatedTaxaList(keyPath, fastQPath);
        BarcodeTrie barcodeTrie=GBSUtils.initializeBarcodeTrie(tl, masterTaxaList, EnzymeList.defaultCache.getEnzyme(enzymeName));
        try {
            processFastQ(fastQPath,barcodeTrie,canonicalTags,preferredTagLength, minQual);
        } catch (StringIndexOutOfBoundsException oobe) {
            throw oobe; // let processData() handle it
        }

    }

    private void processFastQ(Path fastqFile, BarcodeTrie barcodeTrie, Map<Tag, Tag> canonicalTags,
            int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException {
        int allReads=0, goodBarcodedReads = 0, lowQualityReads = 0;
        try {
            int qualityScoreBase=GBSUtils.determineQualityScoreBase(fastqFile);
            BufferedReader br = Utils.getBufferedReader(fastqFile.toString(), 1 << 22);
            long time=System.nanoTime();
            String[] seqAndQual;
            while ((seqAndQual=GBSUtils.readFastQBlock(br, allReads)) != null) {
                allReads++;
                // Decode barcode using the current sequence & quality score
                Barcode barcode=barcodeTrie.longestPrefix(seqAndQual[0]);               
                if(barcode==null) continue;
                if(minQual>0) {
                    //todo move getFirstLowQualityPos into this class?
                    if(BaseEncoder.getFirstLowQualityPos(seqAndQual[1],minQual, qualityScoreBase)<(barcode.getBarLength()+preferredTagLength)){
                        lowQualityReads++;
                        continue;
                    }
                }
                rawReadCountsForFullSampleName.put(barcode.getTaxaName(), rawReadCountsForFullSampleName.get(barcode.getTaxaName()) + 1);
                int barcodeLen = barcode.getBarLength();
                if (seqAndQual[0].length() - barcodeLen < preferredTagLength) {
                    String errMsg = "\n\nERROR processing " + fastqFile.toString() + "\n" +
                            "Reading entry number " + allReads + " fails the length test.\n" +
                            "Sequence length " + seqAndQual[0].length() + " minus barcode length "+ barcodeLen +
                            " is less than kmerLength " + preferredTagLength + ".\n" +
                            "Re-run your files with either a shorter kmerLength value or a higher minimum quality score.\n";
                    throw new StringIndexOutOfBoundsException(errMsg);
                }
 
                Tag tag = removeSecondCutSiteIndexOf(seqAndQual[0].substring(barcodeLen),preferredTagLength);
                //Tag tag = removeSecondCutSiteAhoC(seqAndQual[0].substring(barcodeLen),preferredTagLength);
                //Tag tag= TagBuilder.instance(seqAndQual[0].substring(barcode.getBarLength(), barcode.getBarLength() + preferredTagLength)).build();
                if(tag==null) continue;   //null occurs when any base was not A, C, G, T
                goodBarcodedReads++;
                Tag canonicalTag=canonicalTags.get(tag);
                if(canonicalTag!=null) {
                    tagCntMap.put(barcode.getTaxon(),canonicalTag);
                    matchedReadCountsForFullSampleName.put(barcode.getTaxaName(), matchedReadCountsForFullSampleName.get(barcode.getTaxaName()) + 1);
                }
                if (allReads % 1000000 == 0) {
                    myLogger.info("Total Reads:" + allReads + " Reads with barcode and cut site overhang:" + goodBarcodedReads
                            + " rate:" + (System.nanoTime()-time)/allReads +" ns/read");
                }
            }
            myLogger.info("Total number of reads in lane=" + allReads);
            myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
            myLogger.info("Total number of low quality reads=" + lowQualityReads);
            myLogger.info("Timing process (sorting, collapsing, and writing TagCount to file).");
            myLogger.info("Process took " + (System.nanoTime() - time)/1e6 + " milliseconds for file " + fastqFile.toString());
            br.close();
        } catch (Exception e) {
            myLogger.error("Good Barcodes Read: " + goodBarcodedReads);
            e.printStackTrace();
        }
    }


    // Using indexOf() is much faster than using an Aho-Corasick trie to find
    // the second cut site (junits averaged 15.81 seconds over 5 runs with
    // aho-c, vs 13.53 seconds over 5 runs using the indexOf method).
    private Tag removeSecondCutSiteIndexOf(String seq, int preferredLength) {
        Tag tag = null;
        
        // handle overlapping cutsite for ApeKI enzyme
        if (enzyme().equalsIgnoreCase("ApeKI")) {
            if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
                seq = seq.substring(3);
            }             
        }
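        // Search for the second cut site only beyond the first 20 bases,
        // since 20 is the default minimum tag length (see the Aho-Corasick
        // variant below, which applies the same offset).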
        int indexOfReadEnd = -1;
        String shortSeq = seq.substring(20);
        for (String readEnd: likelyReadEndStrings){
            int indx = shortSeq.indexOf(readEnd);
            if (indx > 0 ) {
                if (indexOfReadEnd < 0 || indx < indexOfReadEnd) {
                    indexOfReadEnd = indx;
                } 
            }
        }
        if (indexOfReadEnd > 0 &&
                (indexOfReadEnd + 20 + readEndCutSiteRemnantLength < preferredLength)) {
            // trim tag to sequence up to & including the cut site
            tag = TagBuilder.instance(seq.substring(0, indexOfReadEnd + 20 + readEndCutSiteRemnantLength)).build();
 
        } else {
            int seqEnd = Math.min(seq.length(), preferredLength);
            return TagBuilder.instance(seq.substring(0,seqEnd)).build();
        }
        return tag;
    }
    private Tag removeSecondCutSiteAhoC(String seq, int preferredLength) {
        // Removes the second cut site BEFORE we trim the tag.
        // This preserves the cut site in case it shows up in the middle.
        Tag tag = null;
        int cutSiteIndex = 9999;
        
        // handle overlapping cutsite for ApeKI enzyme
        if (enzyme().equalsIgnoreCase("ApeKI")) {
            if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
                seq = seq.substring(3,seq.length());
            }             
        }
        // use ahoCorasickTrie to find cut site       
        // Start at seq+20 since 20 is default minimum length
        Collection<Emit> emits = null;
        try {
            emits = ahoCorasickTrie.parseText(seq.substring(20));
        } catch (Exception emitsEx) {
            System.out.println("ahoCorasick excep: seq: " + seq);
            emitsEx.printStackTrace();
            // proceed as if no tag was found.  need to understand why sometimes
            // aho-c complains of a null pointer
            int seqEnd = Math.min(seq.length(), preferredLength);
            return TagBuilder.instance(seq.substring(0,seqEnd)).build();
        }
        
        // Debugging showed that when more than one match is found, the emits
        // appear in the order they occur in the seq, so we only need the first one.
        for (Emit emit: emits) {
            cutSiteIndex = emit.getStart();
            break;
        }

        if (cutSiteIndex + 20 + readEndCutSiteRemnantLength < preferredLength) { // add 20 as we cut off first 20 for aho-c parsing
            // trim tag to sequence up to & including the cut site
            tag = TagBuilder.instance(seq.substring(0, cutSiteIndex + 20 + readEndCutSiteRemnantLength)).build();
        } else {
            // if no cut site found, or it is beyond preferred length
            int seqEnd = Math.min(seq.length(), preferredLength);
            tag = TagBuilder.instance(seq.substring(0,seqEnd)).build();
        }
        return tag;       
    }
    private void reportProgress(int[] counters, long readSeqReadTime, long ifRRNotNullTime) {
        myLogger.info(
                "totalReads:" + counters[0]
                + "  goodBarcodedReads:" + counters[1]
                + "  goodMatchedToTOPM:" + counters[2]
                //            + "  perfectMatches:" + counters[3]
                //            + "  nearMatches:" + counters[4]
                //            + "  uniqueNearMatches:" + counters[5]
                + "  cumulReadSequenceTime: " + ((double) (readSeqReadTime) / 1_000_000_000.0) + " sec"
                + "  cumulProcessSequenceTime: " + ((double) (ifRRNotNullTime) / 1_000_000_000.0) + " sec"
        );
    }

    private void reportTotals(Path fileName, int[] counters, int nFilesProcessed) {
        myLogger.info("Total number of reads in lane=" + counters[0]);
        myLogger.info("Total number of good, barcoded reads=" + counters[1]);
        myLogger.info("Total number of good, barcoded reads matched to the TOPM=" + counters[2]);
        myLogger.info("Finished reading " + nFilesProcessed + " of " + seqFilesInKeyAndDir.size() + " sequence files: " + fileName + "\n");
    }


    private static GenotypeTableBuilder setUpGenotypeTableBuilder(String anOutputFile, PositionList positionList, GenotypeMergeRule mergeRule) {
        if (isHDF5) {
            File hdf5File = new File(anOutputFile);
            if (hdf5File.exists()) {
                myLogger.info("\nGenotypes will be added to existing HDF5 file:\n  " + anOutputFile + "\n");
                return GenotypeTableBuilder.mergeTaxaIncremental(anOutputFile, mergeRule);
            } else {
                myLogger.info("\nThe target HDF5 file:\n  " + anOutputFile
                        + "\ndoes not exist. A new HDF5 file of that name will be created \nto hold the genotypes from this run.");
                return GenotypeTableBuilder.getTaxaIncrementalWithMerging(anOutputFile, positionList, mergeRule);
            }
        } else { // create genotype table for VCF
            GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList, mergeRule);
            myLogger.info("\nOutput VCF file: \n" + anOutputFile +
                    " \ncreated for genotypes from this run.");
            return gtb;
        }
    }

    private static byte[] resolveGenosForTaxon(int[][] depthsForTaxon, GenotypeMergeRule genoMergeRule) {
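        // For each site, gather the depths of all alleles and let the merge rule
        // call the genotype.  As a rough sketch of the default behavior with
        // BasicGenotypeMergeRule (eR = 0.01): depths such as {A:5, C:4} would
        // typically be called heterozygous, while {A:9, C:0} would be called
        // homozygous A; the exact decision is made by callBasedOnDepth().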
        int nAlleles = depthsForTaxon.length;
        int[] depthsAtSite = new int[nAlleles];
        int nSites = depthsForTaxon[0].length;
        byte[] genos = new byte[nSites];
        for (int site = 0; site < nSites; site++) {
            for (int allele = 0; allele < nAlleles; allele++) {
                depthsAtSite[allele] = depthsForTaxon[allele][site];
            }
            genos[site] = genoMergeRule.callBasedOnDepth(depthsAtSite);
        }
        return genos;
    }

    private void writeReadsPerSampleReports(int snpsProcessed) {
        myLogger.info("\nWriting ReadsPerSample log file...");
        String outFileS = myOutputDir + File.separator + (new File(keyFile())).getName();
        outFileS = outFileS.replaceAll(".txt", "_ReadsPerSample.log");
        outFileS = outFileS.replaceAll("_key", "");
        try {
        	String msg = "ReadsPerSample log file: " + outFileS;
        	myLogger.info(msg);
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outFileS))), 65536);
            bw.write("FullSampleName\t\t\tgoodBarcodedReads\tgoodReadsMatchedToDataBase\n");
            for (String fullSampleName : rawReadCountsForFullSampleName.keySet()) {
                bw.write(fullSampleName + "\t" + rawReadCountsForFullSampleName.get(fullSampleName) + "\t\t" + matchedReadCountsForFullSampleName.get(fullSampleName) + "\n");
            }
            bw.close();
        } catch (Exception e) {
            myLogger.error("Couldn't write to ReadsPerSample log file: " + e);
            e.printStackTrace();
            System.exit(1);
        }
        myLogger.info("\n\nTotal number of SNPs processed with minimum quality score " + minimumQualityScore() + " was " + tagsProcessed + ".\n");
        myLogger.info("   ...done\n");
    }
    
    private void writeInitialTaxaReadCounts(TaxaList tl) {
        tl.stream() // Add initial taxa names with count of 0 to synchronized maps
                .forEach(taxon -> {
                    rawReadCountsForFullSampleName.put(taxon.getName(), 0);
                    matchedReadCountsForFullSampleName.put(taxon.getName(), 0);
                });
    }

    public void setTagLenException() {
        taglenException = true;
    }
    
    private void printFileNameConventions(String actualFileName) {
        String message
                = "\n\n"
                + "Error in parsing file name:"
                + "\n   The raw sequence filename does not contain either 3, 4, or 5 underscore-delimited values."
                + "\n   Acceptable file naming conventions include the following (where FLOWCELL indicates the flowcell name and LANE is an integer):"
                + "\n       FLOWCELL_LANE_fastq.gz"
                + "\n       FLOWCELL_s_LANE_fastq.gz"
                + "\n       code_FLOWCELL_s_LANE_fastq.gz"
                + "\n       FLOWCELL_LANE_fastq.txt.gz"
                + "\n       FLOWCELL_s_LANE_fastq.txt.gz"
                + "\n       code_FLOWCELL_s_LANE_fastq.txt.gz"
                + "\n       FLOWCELL_LANE_qseq.txt.gz"
                + "\n       FLOWCELL_s_LANE_qseq.txt.gz"
                + "\n       code_FLOWCELL_s_LANE_qseq.txt.gz"
                + "\n"
                + "\n   Actual Filename: " + actualFileName
                + "\n\n";

        myLogger.error(message);
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "Production SNP Caller";
    }

    @Override
    public String getToolTipText() {
        return "Production SNP Caller";
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(ProductionSNPCallerPluginV2.class);
    // }

    /**
     * Convenience method to run plugin with one return object.
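     * A hypothetical fluent-style setup (paths are illustrative; note that this
     * plugin writes its results to the output genotype file rather than
     * returning them in the DataSet):
     * <pre>
     * new ProductionSNPCallerPluginV2(null, false)
     *     .inputDirectory("fastqDir").keyFile("key.txt").enzyme("ApeKI")
     *     .inputGBSDatabase("GBSv2.db").outputGenotypesFile("out.vcf")
     *     .performFunction(null);
     * </pre>
     *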
     */
    // TODO: Replace <Type> with specific type.
    public TagData runPlugin(DataSet input) {
        return (TagData) performFunction(input).getData(0).getData();
    }

    /**
     * Input directory containing fastq AND/OR qseq files.
     *
     * @return Input Directory
     */
    public String inputDirectory() {
        return myInputDir.value();
    }

    /**
     * Set Input Directory. Input directory containing fastq
     * AND/OR qseq files.
     *
     * @param value Input Directory
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 inputDirectory(String value) {
        myInputDir = new PluginParameter<>(myInputDir, value);
        return this;
    }

    /**
     * Key file listing barcodes distinguishing the samples
     *
     * @return Key File
     */
    public String keyFile() {
        return myKeyFile.value();
    }

    /**
     * Set Key File. Key file listing barcodes distinguishing
     * the samples
     *
     * @param value Key File
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 keyFile(String value) {
        myKeyFile = new PluginParameter<>(myKeyFile, value);
        return this;
    }

    /**
     * Enzyme used to create the GBS library
     *
     * @return Enzyme
     */
    public String enzyme() {
        return myEnzyme.value();
    }

    /**
     * Set Enzyme. Enzyme used to create the GBS library
     *
     * @param value Enzyme
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 enzyme(String value) {
        myEnzyme = new PluginParameter<>(myEnzyme, value);
        return this;
    }

    /**
     * Input Database file if using SQLite
     *
     * @return Input GBS Database
     */
    public String inputGBSDatabase() {
        return myInputDB.value();
    }

    /**
     * Set Input GBS Database. Input Database file if using
     * SQLite
     *
     * @param value Input GBS Database
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 inputGBSDatabase(String value) {
        myInputDB = new PluginParameter<>(myInputDB, value);
        return this;
    }

    /**
     * Output (target) genotypes file to add new genotypes
     * to (new file created if it doesn't exist)
     *
     * @return Output file Genotypes File
     */
    public String outputGenotypesFile() {
        return myOutputGenotypes.value();
    }

    /**
     * Set Output Genotypes File. Output (target)
     * genotypes file to add new genotypes to (new file created
     * if it doesn't exist)
     *
     * @param value Output Genotypes File
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 outputGenotypesFile(String value) {
        myOutputGenotypes = new PluginParameter<>(myOutputGenotypes, value);
        return this;
    }

    /**
     * Average sequencing error rate per base (used to decide
     * between heterozygous and homozygous calls)
     *
     * @return Ave Seq Error Rate
     */
    public Double aveSeqErrorRate() {
        return myAveSeqErrorRate.value();
    }

    /**
     * Set Ave Seq Error Rate. Average sequencing error rate
     * per base (used to decide between heterozygous and homozygous
     * calls)
     *
     * @param value Ave Seq Error Rate
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 aveSeqErrorRate(Double value) {
        myAveSeqErrorRate = new PluginParameter<>(myAveSeqErrorRate, value);
        return this;
    }

    /**
     * Maximum divergence (edit distance) between new read
     * and previously mapped read (Default: 0 = perfect matches
     * only)
     *
     * @return Max Divergence
     */
    public Integer maxDivergence() {
        return myMaxDivergence.value();
    }

    /**
     * Set Max Divergence. Maximum divergence (edit distance)
     * between new read and previously mapped read (Default:
     * 0 = perfect matches only)
     *
     * @param value Max Divergence
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 maxDivergence(Integer value) {
        myMaxDivergence = new PluginParameter<>(myMaxDivergence, value);
        return this;
    }

    /**
     * Keep hdf5 genotypes open for future runs that add more
     * taxa or more depth
     *
     * @return Keep Genotypes Open
     */
    public Boolean keepGenotypesOpen() {
        return myKeepGenotypesOpen.value();
    }

    /**
     * Set Keep Genotypes Open. Keep hdf5 genotypes open for
     * future runs that add more taxa or more depth
     *
     * @param value Keep Genotypes Open
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 keepGenotypesOpen(Boolean value) {
        myKeepGenotypesOpen = new PluginParameter<>(myKeepGenotypesOpen, value);
        return this;
    }

    /**
     * Output depth: write depths to the output
     * hdf5 genotypes file
     *
     * @return Depth to Output - true or false
     */
    public Boolean depthToOutput() {
        return myDepthOutput.value();
    }

    /**
     * User sets true or false, indicating if they do
     * or do not want depth information written to the
     * HDF5 file.
     *
     * @param value Write depth to output file
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 depthToOutput(Boolean value) {
        myDepthOutput = new PluginParameter<>(myDepthOutput, value);
        return this;
    }
    /**
     * Maximum Tag Length
     *
     * @return Maximum Tag Length
     */
    public Integer kmerLength() {
        return myKmerLength.value();
    }

    /**
     * Set Maximum Tag Length: the user should set this value
     * equal to the maximum tag length used in GBSSeqToTagDBPlugin
     * when creating the database. If the two values are not
     * equal, inconsistent results may occur.
     *
     * @param value Maximum Tag Length
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 kmerLength(Integer value) {
        myKmerLength = new PluginParameter<>(myKmerLength, value);
        return this;
    }
    /**
     *  Minimum Position Quality Score
     *
     * @return Minimum position quality score
     */
    public Double positionQualityScore() {
        return posQualityScore.value();
    }

    /**
     * Set Minimum quality score for position:  This value is used to pull
     * SNPs out of the snpposition table.  Only snps with quality
     * scores meeting or exceeding the specified value will be 
     * processed.
     *
     * @param value Minimum position quality score
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 positionQualityScore(Double value) {
        posQualityScore = new PluginParameter<>(posQualityScore, value);
        return this;
    }
    
    /**
     * Batch size for processing fastq files
     *
     * @return batch size
     */
    public Integer batchSize() {
        return myBatchSize.value();
    }

    /**
     * Set the number of fastq files processed simultaneously.
     *
     * @param value batch size
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 batchSize(Integer value) {
        myBatchSize = new PluginParameter<>(myBatchSize, value);
        return this;
    }
    /**
     * Minimum quality score within the barcode and read length
     * to be accepted
     *
     * @return Minimum quality score
     */
    public Integer minimumQualityScore() {
        return myMinQualScore.value();
    }

    /**
     * Set Minimum quality score. Minimum quality score within
     * the barcode and read length to be accepted
     *
     * @param value Minimum quality score
     *
     * @return this plugin
     */
    public ProductionSNPCallerPluginV2 minimumQualityScore(Integer value) {
        myMinQualScore = new PluginParameter<>(myMinQualScore, value);
        return this;
    }
    /**
     * Use STACKS likelihood method to call heterozygotes (default: use
     * tasselGBS likelihood ratio method)
     *
     * @return Use Stacks Likelihood
     */
    //public Boolean useStacksLikelihood() {
    //    return myStacksLikelihood.value();
    //}
    /**
     * Set Use Stacks Likelihood. Use STACKS likelihood method to call
     * heterozygotes (default: use tasselGBS likelihood ratio method)
     *
     * @param value Use Stacks Likelihood
     *
     * @return this plugin
     */
    //public ProductionSNPCallerPluginV2 useStacksLikelihood(Boolean value) {
    //    myStacksLikelihood = new PluginParameter<>(myStacksLikelihood, value);
    //    return this;
    //}
}



