/*
* ProductionSNPCallerPlugin
*/
package net.maizegenetics.analysis.gbs;
import cern.colt.list.IntArrayList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.Multimap;
import com.google.common.collect.Table;
import com.google.common.collect.TreeMultimap;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.MultiMemberGZIPInputStream;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.BasicGenotypeMergeRule;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.Taxon;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.*;
import java.util.*;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.util.GeneralAnnotation;
/**
* This plugin converts all of the fastq (and/or qseq) files in the input folder,
* guided by the barcode key file, to genotypes and adds these to a genotype file
* in HDF5 format.
*
* We refer to this step as the "Production Pipeline".
*
* The output format is HDF5 genotypes with allelic depth stored. SNP calling is
* quantitative with the option of using either the Glaubitz/Buckler binomial
* method (pHet/pErr > 1 = het) (=default), or the Stacks method.
*
* Merging of samples with the same LibraryPrepID is handled by
* GenotypeTableBuilder.addTaxon(), with the genotypes re-called based upon the
* new depths. Therefore, if you want to keep adding genotypes to the same
* target HDF5 file in subsequent runs, use the -ko (keep open) option so that
* the output GenotypeTableBuilder will be mutable, using closeUnfinished()
* rather than build().
*
* If the target output HDF5 GenotypeTable file doesn't exist, it will be
* created.
*
* Each taxon in the HDF5 file is named "ShortName:LibraryPrepID" and is
* annotated with "Flowcell_Lanes" (=source seq data for current genotype).
*
* Requires a TOPM with variants added from a previous "Discovery Pipeline" run,
* in binary TOPM or HDF5 format (TOPMInterface).
*
* TODO add the Stacks likelihood method to BasicGenotypeMergeRule
*
* @author Jeff Glaubitz
* @author Ed Buckler
*/
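// Illustrative usage (a sketch, not part of the original source; file and
// directory names are hypothetical). From the TASSEL pipeline command line:
//
//   ./run_pipeline.pl -fork1 -ProductionSNPCallerPlugin \
//       -i myRawSeqDir -k myKeyFile.txt -e ApeKI \
//       -m myProductionTOPM.h5 -o myGenotypes.h5 -ko \
//       -endPlugin -runfork1
//
// Or programmatically, assuming the standard Plugin.performFunction(DataSet)
// entry point:
//
//   new ProductionSNPCallerPlugin(null, false)
//       .inputDirectory("myRawSeqDir")
//       .keyFile("myKeyFile.txt")
//       .enzyme("ApeKI")
//       .inputTOPMFile("myProductionTOPM.h5")
//       .outputHDF5GenotypesFile("myGenotypes.h5")
//       .keepGenotypesOpen(true)
//       .performFunction(null);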
public class ProductionSNPCallerPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(ProductionSNPCallerPlugin.class);
private PluginParameter<String> myInputDirectory = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true).inDir()
.description("Input directory containing fastq AND/OR qseq files.").build();
private PluginParameter<String> myKeyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
.description("Key file listing barcodes distinguishing the samples").build();
private PluginParameter<String> myEnzyme = new PluginParameter.Builder<>("e", null, String.class).guiName("Enzyme").required(true)
.description("Enzyme used to create the GBS library").build();
private PluginParameter<String> myProductionTOPM = new PluginParameter.Builder<>("m", null, String.class).guiName("Input TOPM File").required(true).inFile()
.description("Physical map file containing tags and corresponding variants (production TOPM)").build();
private PluginParameter<String> myOutputGenotypes = new PluginParameter.Builder<>("o", null, String.class).guiName("Output HDF5 Genotypes File").required(true).outFile()
.description("Output (target) HDF5 genotypes file to add new genotypes to (new file created if it doesn't exist)").build();
private PluginParameter<Double> myAveSeqErrorRate = new PluginParameter.Builder<>("eR", 0.01, Double.class).guiName("Ave Seq Error Rate")
.description("Average sequencing error rate per base (used to decide between heterozygous and homozygous calls)").build();
// private PluginParameter<Integer> myMaxDivergence = new PluginParameter.Builder<>("d", 0, Integer.class).guiName("Max Divergence")
// .description("Maximum divergence (edit distance) between new read and previously mapped read (Default: 0 = perfect matches only)").build();
private PluginParameter<Boolean> myKeepGenotypesOpen = new PluginParameter.Builder<>("ko", false, Boolean.class).guiName("Keep Genotypes Open")
.description("Keep hdf5 genotypes open for future runs that add more taxa or more depth").build();
private PluginParameter<Boolean> myNoDepthOutput = new PluginParameter.Builder<>("ndo", false, Boolean.class).guiName("No Depth to Output")
.description("No depth output: do not write depths to the output hdf5 genotypes file").build();
//private PluginParameter<Boolean> myStacksLikelihood = new PluginParameter.Builder<>("sL", false, Boolean.class).guiName("Use Stacks Likelihood")
// .description("Use STACKS likelihood method to call heterozygotes (default: use tasselGBS likelihood ratio method)").build();
private String[] myRawSeqFileNames = null;
private String myOutputDir = null;
private TOPMInterface topm = null;
private int[] chromosomes = null;
private boolean fastq = true;
private Map<String, Integer> keyFileColumns = new HashMap<>();
private Set<String> flowcellLanesInKey = new TreeSet<>(); // flowcellLanes present in key file (stored here as "Flowcell_Lane")
private Map<String, String> seqFileNameToFlowcellLane = new HashMap<>(); // map from name of fastq or qseq file to "Flowcell_Lane"
private Set<String> seqFilesInKeyAndDir = new TreeSet<>(); // fastq (or qseq) file names present in input directory that have a "Flowcell_Lane" in the key file
private Map<String, String> fullNameToHDF5Name = new TreeMap<>();
private Multimap<String, String> flowCellLaneToLibPrepIDs = TreeMultimap.create();
private Map<String, String> libraryPrepIDToSampleName = new TreeMap<>();
private GenotypeTableBuilder genos = null; //output genotype table
private TaxaList taxaList = null;
private PositionList myPositionList = null;
private IntArrayList[] obsTagsForEachTaxon = null;
private Table<Integer, Integer, Integer> positionToSite = null; // row = chromosome number, column = physical position, value = site index in the output genotype table
// Documentation of read depth per sample (one record per replicate)
private Map<String, Integer> rawReadCountsForFullSampleName = new TreeMap<>();
private Map<String, Integer> matchedReadCountsForFullSampleName = new TreeMap<>();
private BasicGenotypeMergeRule genoMergeRule = null;
public ProductionSNPCallerPlugin() {
super(null, false);
}
public ProductionSNPCallerPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public void postProcessParameters() {
File rawSeqDirectory = new File(inputDirectory());
myRawSeqFileNames = DirectoryCrawler.listFileNames(rawSeqFileNameRegex, rawSeqDirectory.getAbsolutePath());
if (myRawSeqFileNames == null || myRawSeqFileNames.length == 0) {
throw new IllegalArgumentException(noMatchingRawSeqFileNamesMessage + inputDirectory());
} else {
Arrays.sort(myRawSeqFileNames);
String message = "\nProductionSNPCallerPlugin:\n\nThe following GBS raw sequence data files were found in the input folder (and sub-folders):";
for (String filename : myRawSeqFileNames) {
message += "\n " + filename;
}
myLogger.info(message + "\n");
}
topm = TOPMUtils.readTOPM(inputTOPMFile());
if (topm.getSize() == 0) {
throw new IllegalStateException("TagsOnPhysicalMap file not available or is empty");
}
try {
myOutputDir = (new File(outputHDF5GenotypesFile())).getCanonicalFile().getParent();
} catch (IOException e) {
throw new IllegalStateException("Problem resolving output directory: " + e);
}
}
@Override
public DataSet processData(DataSet input) {
long previous, current;
readKeyFile(); // TODO: read/write full set of metadata
matchKeyFileToAvailableRawSeqFiles();
myPositionList = getUniquePositions();
generateFastSiteLookup(myPositionList);
setUpGenotypeTableBuilder();
int nFilesProcessed = 0;
for (int fileNum = 0; fileNum < myRawSeqFileNames.length; fileNum++) {
if (!seqFilesInKeyAndDir.contains(myRawSeqFileNames[fileNum])) {
continue; // skip fastq/qseq files that are not in the key file
}
int[] counters = {0, 0, 0, 0, 0, 0}; // 0:allReads 1:goodBarcodedReads 2:goodMatched 3:perfectMatches 4:imperfectMatches 5:singleImperfectMatches
myLogger.info("\nLooking for known SNPs in sequence reads from file:");
myLogger.info(" " + myRawSeqFileNames[fileNum] + "\n");
previous = System.nanoTime();
readRawSequencesAndRecordDepth(fileNum, counters); // TODO: read the machine name from the fastq/qseq file
current = System.nanoTime();
System.out.println("ProductionSNPCallerPlugin: performFunction: readRawSequencesAndRecordDepth: " + myRawSeqFileNames[fileNum] + ": " + ((double) (current - previous) / 1_000_000_000.0) + " sec");
previous = System.nanoTime();
callGenotypes();
current = System.nanoTime();
System.out.println("ProductionSNPCallerPlugin: performFunction: callGenotypes: " + myRawSeqFileNames[fileNum] + ": " + ((double) (current - previous) / 1_000_000_000.0) + " sec");
++nFilesProcessed;
reportTotals(fileNum, counters, nFilesProcessed);
}
if (keepGenotypesOpen()) {
previous = System.nanoTime();
genos.closeUnfinished();
current = System.nanoTime();
System.out.println("ProductionSNPCallerPlugin: performFunction: genos.closeUnfinished(): " + ((double) (current - previous) / 1_000_000_000.0) + " sec");
} else {
previous = System.nanoTime();
genos.build();
current = System.nanoTime();
System.out.println("ProductionSNPCallerPlugin: performFunction: genos.build(): " + ((double) (current - previous) / 1_000_000_000.0) + " sec");
}
writeReadsPerSampleReports();
return null;
}
private void readRawSequencesAndRecordDepth(int fileNum, int[] counters) {
long previous, current;
ParseBarcodeRead thePBR = setUpBarcodes(fileNum);
if (thePBR == null || thePBR.getBarCodeCount() == 0) {
myLogger.info("No barcodes found. Skipping this raw sequence file.");
return;
}
taxaList = new TaxaListBuilder().addAll(getHDF5Taxa(fileNum)).sortTaxaAlphabetically().build();
obsTagsForEachTaxon = new IntArrayList[taxaList.numberOfTaxa()];
for (int t = 0; t < obsTagsForEachTaxon.length; t++) {
obsTagsForEachTaxon[t] = new IntArrayList(750_000); // initial capacity
}
String temp = "Nothing has been read from the raw sequence file yet";
BufferedReader br = getBufferedReaderForRawSeqFile(fileNum);
try {
long readSeqReadTime = 0;
long ifRRNotNullTime = 0;
while ((temp = br.readLine()) != null) {
if (counters[0] % 1_000_000 == 0) {
reportProgress(counters, readSeqReadTime, ifRRNotNullTime);
}
previous = System.nanoTime();
ReadBarcodeResult rr = readSequenceRead(br, temp, thePBR, counters);
current = System.nanoTime();
readSeqReadTime += (current - previous);
previous = System.nanoTime();
if (rr != null) {
counters[1]++; // goodBarcodedReads
rawReadCountsForFullSampleName.put(rr.getTaxonName(), rawReadCountsForFullSampleName.get(rr.getTaxonName()) + 1);
int tagIndex = topm.getTagIndex(rr.getRead());
if (tagIndex >= 0) {
counters[3]++; // perfectMatches
}
// if (tagIndex < 0 && maxDivergence() > 0) {
// tagIndex = findBestImperfectMatch(rr.getRead(), counters);
// }
if (tagIndex < 0) {
continue;
}
counters[2]++; // goodMatched++;
matchedReadCountsForFullSampleName.put(rr.getTaxonName(), matchedReadCountsForFullSampleName.get(rr.getTaxonName()) + 1);
int taxonIndex = taxaList.indexOf(fullNameToHDF5Name.get(rr.getTaxonName()));
if (taxonIndex == -1) {
myLogger.warn("readRawSequencesAndRecordDepth: taxon " + fullNameToHDF5Name.get(rr.getTaxonName()) + " was not found in the taxa list for this lane");
}
obsTagsForEachTaxon[taxonIndex].add(tagIndex);
}
current = System.nanoTime();
ifRRNotNullTime += (current - previous);
}
System.out.println("ProductionSNPCallerPlugin: readRawSequencesAndRecordDepth: readSequenceTime: " + ((double) (readSeqReadTime) / 1_000_000_000.0) + " sec");
System.out.println("ProductionSNPCallerPlugin: readRawSequencesAndRecordDepth: processSequenceTime: " + ((double) (ifRRNotNullTime) / 1_000_000_000.0) + " sec");
br.close();
} catch (Exception e) {
myLogger.error("Catch in readRawSequencesAndRecordDepth() at nReads=" + counters[0] + " e=" + e);
myLogger.error("Last line read: " + temp);
e.printStackTrace();
System.exit(1);
}
}
private void reportProgress(int[] counters, long readSeqReadTime, long ifRRNotNullTime) {
myLogger.info(
"totalReads:" + counters[0]
+ " goodBarcodedReads:" + counters[1]
+ " goodMatchedToTOPM:" + counters[2]
// + " perfectMatches:" + counters[3]
// + " nearMatches:" + counters[4]
// + " uniqueNearMatches:" + counters[5]
+ " cumulReadSequenceTime: " + ((double) (readSeqReadTime) / 1_000_000_000.0) + " sec"
+ " cumulProcessSequenceTime: " + ((double) (ifRRNotNullTime) / 1_000_000_000.0) + " sec"
);
}
private void reportTotals(int fileNum, int[] counters, int nFilesProcessed) {
myLogger.info("Total number of reads in lane=" + counters[0]);
myLogger.info("Total number of good, barcoded reads=" + counters[1]);
myLogger.info("Total number of good, barcoded reads matched to the TOPM=" + counters[2]);
myLogger.info("Finished reading " + nFilesProcessed + " of " + seqFilesInKeyAndDir.size() + " sequence files: " + myRawSeqFileNames[fileNum] + "\n");
}
private void readKeyFile() {
flowcellLanesInKey.clear();
seqFileNameToFlowcellLane.clear();
seqFilesInKeyAndDir.clear();
fullNameToHDF5Name.clear();
flowCellLaneToLibPrepIDs.clear();
libraryPrepIDToSampleName.clear();
String inputLine = "Nothing has been read from the keyfile yet";
try {
BufferedReader br = new BufferedReader(new FileReader(keyFile()), 65536);
int currLine = 0;
while ((inputLine = br.readLine()) != null) {
if (currLine == 0) {
parseKeyFileHeader(inputLine);
} else {
populateKeyFileFields(inputLine);
}
currLine++;
}
} catch (Exception e) {
myLogger.error("Couldn't read key file: " + e);
myLogger.error("Last line read from key file: " + inputLine);
e.printStackTrace();
System.exit(1);
}
}
private void parseKeyFileHeader(String headerLine) {
headerLine = headerLine.trim();
String[] header = headerLine.split("\\t");
keyFileColumns.clear();
for (int col = 0; col < header.length; col++) {
if (header[col].equalsIgnoreCase("Flowcell")) {
keyFileColumns.put("Flowcell", col);
} else if (header[col].equalsIgnoreCase("Lane")) {
keyFileColumns.put("Lane", col);
} else if (header[col].equalsIgnoreCase("Barcode")) {
keyFileColumns.put("Barcode", col);
} else if (header[col].equalsIgnoreCase("DNASample") || header[col].equalsIgnoreCase("Sample")) {
keyFileColumns.put("Sample", col);
} else if (header[col].equalsIgnoreCase("LibraryPrepID")) {
keyFileColumns.put("LibPrepID", col);
}
}
if (!confirmKeyFileHeader()) {
throwBadKeyFileError();
}
}
private boolean confirmKeyFileHeader() {
// check the headers
if (!keyFileColumns.containsKey("Flowcell")) {
return false;
}
if (!keyFileColumns.containsKey("Lane")) {
return false;
}
if (!keyFileColumns.containsKey("Barcode")) {
return false;
}
if (!keyFileColumns.containsKey("Sample")) {
return false;
}
if (!keyFileColumns.containsKey("LibPrepID")) {
return false;
}
// check the column order (needed for ParseBarcodeRead)
if (!keyFileColumns.get("Flowcell").equals(0)) {
return false;
}
if (!keyFileColumns.get("Lane").equals(1)) {
return false;
}
if (!keyFileColumns.get("Barcode").equals(2)) {
return false;
}
if (!keyFileColumns.get("Sample").equals(3)) {
return false;
}
if (!keyFileColumns.get("LibPrepID").equals(7)) {
return false;
}
return true;
}
private void throwBadKeyFileError() {
String badKeyFileMessage
= "\n\nThe keyfile does not conform to expectations.\n"
+ "It must contain columns with the following (exact) headers, and\n"
+ "in the indicated columns:\n"
+ " \"Flowcell\" (column A)\n"
+ " \"Lane\" (column B)\n"
+ " \"Barcode\" (column C)\n"
+ " \"DNASample\" or \"Sample\" (column D)\n"
+ " \"LibraryPrepID\" (column H)\n"
+ "\n\n";
try {
throw new IllegalStateException(badKeyFileMessage);
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
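// A minimal key file sketch that satisfies the header checks above (tab-delimited;
// flowcell/sample values are hypothetical, and the unvalidated columns E-G are
// shown as "..."):
//
//   Flowcell    Lane    Barcode    DNASample    ...    ...    ...    LibraryPrepID
//   C08B7ACXX   3       CTCC       Sample001    ...    ...    ...    81546
//   C08B7ACXX   3       TGCA       Sample002    ...    ...    ...    81547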
private void populateKeyFileFields(String keyFileLine) {
keyFileLine = keyFileLine.trim();
String[] cells = keyFileLine.split("\\t");
String sample = cells[keyFileColumns.get("Sample")];
String libPrepID = cells[keyFileColumns.get("LibPrepID")];
String fullName = sample + ":" + cells[keyFileColumns.get("Flowcell")] + ":" + cells[keyFileColumns.get("Lane")] + ":" + libPrepID;
rawReadCountsForFullSampleName.put(fullName, 0);
matchedReadCountsForFullSampleName.put(fullName, 0);
fullNameToHDF5Name.put(fullName, sample + ":" + libPrepID);
String flowCell_Lane = cells[keyFileColumns.get("Flowcell")] + "_" + cells[keyFileColumns.get("Lane")];
flowcellLanesInKey.add(flowCell_Lane);
flowCellLaneToLibPrepIDs.put(flowCell_Lane, libPrepID);
String prevSample = libraryPrepIDToSampleName.get(libPrepID);
if (prevSample == null) {
libraryPrepIDToSampleName.put(libPrepID, sample);
} else if (!prevSample.contentEquals(sample)) {
try {
throw new IllegalStateException("\nThe key file contains different Sample names (\"" + prevSample + "\" and \"" + sample + "\") for the same LibraryPrepID (" + libPrepID + ")\n\n");
} catch (Exception e) {
myLogger.error("Error in key file: " + e);
e.printStackTrace();
System.exit(1);
}
}
}
private void matchKeyFileToAvailableRawSeqFiles() {
String message
= "\nThe following raw sequence files in the input directory conform to one of our file naming conventions and have corresponding samples in the barcode key file:";
for (int fileNum = 0; fileNum < myRawSeqFileNames.length; fileNum++) {
String[] flowcellLane = parseRawSeqFileName(myRawSeqFileNames[fileNum]);
if (flowcellLane != null && flowcellLanesInKey.contains(flowcellLane[0] + "_" + flowcellLane[1])) {
seqFileNameToFlowcellLane.put(myRawSeqFileNames[fileNum], flowcellLane[0] + "_" + flowcellLane[1]);
seqFilesInKeyAndDir.add(myRawSeqFileNames[fileNum]);
message += "\n " + myRawSeqFileNames[fileNum];
}
}
message += "\n";
myLogger.info(message);
}
private ParseBarcodeRead setUpBarcodes(int fileNum) {
System.gc();
String message = "\nWorking on GBS raw sequence file: " + myRawSeqFileNames[fileNum];
fastq = true;
if (new File(myRawSeqFileNames[fileNum]).getName().contains("qseq")) {
fastq = false;
}
if (fastq) {
message += "\n\tThis file is assumed to be in fastq format";
} else {
message += "\n\tThis file contains 'qseq' in its name so is assumed to be in qseq format";
}
String[] flowcellLane = parseRawSeqFileName(myRawSeqFileNames[fileNum]);
if (flowcellLane == null) {
myLogger.info(message);
return null;
} else {
ParseBarcodeRead thePBR = new ParseBarcodeRead(keyFile(), enzyme(), flowcellLane[0], flowcellLane[1]);
message += "\nTotal barcodes found in key file for this lane:" + thePBR.getBarCodeCount() + "\n";
myLogger.info(message);
return thePBR;
}
}
/**
* Parses out the flowcell and lane from the raw GBS sequence filename
* (fastq or qseq file)
*
* @param rawSeqFileName
* @return String[2] where element[0]=flowcell and element[1]=lane
*/
private String[] parseRawSeqFileName(String rawSeqFileName) {
File rawSeqFile = new File(rawSeqFileName);
String[] fileNameParts = rawSeqFile.getName().split("_");
if (fileNameParts.length == 3) {
return new String[]{fileNameParts[0], fileNameParts[1]};
} else if (fileNameParts.length == 4) {
return new String[]{fileNameParts[0], fileNameParts[2]};
} else if (fileNameParts.length == 5) {
return new String[]{fileNameParts[1], fileNameParts[3]};
} else {
printFileNameConventions(rawSeqFileName);
return null;
}
}
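// Illustrative mappings (hypothetical flowcell "C08B7ACXX", lane 3):
//   "C08B7ACXX_3_fastq.txt.gz"        -> 3 parts -> {"C08B7ACXX", "3"}
//   "C08B7ACXX_s_3_fastq.txt.gz"      -> 4 parts -> {"C08B7ACXX", "3"}
//   "code_C08B7ACXX_s_3_fastq.txt.gz" -> 5 parts -> {"C08B7ACXX", "3"}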
private void setUpGenotypeTableBuilder() {
genoMergeRule = new BasicGenotypeMergeRule(aveSeqErrorRate());
File hdf5File = new File(outputHDF5GenotypesFile());
if (hdf5File.exists()) {
myLogger.info("\nGenotypes will be added to existing HDF5 file:\n " + outputHDF5GenotypesFile() + "\n");
genos = GenotypeTableBuilder.mergeTaxaIncremental(outputHDF5GenotypesFile(), genoMergeRule);
} else {
myLogger.info("\nThe target HDF5 file:\n " + outputHDF5GenotypesFile()
+ "\ndoes not exist. A new HDF5 file of that name will be created \nto hold the genotypes from this run.");
genos = GenotypeTableBuilder.getTaxaIncrementalWithMerging(outputHDF5GenotypesFile(), myPositionList, genoMergeRule);
}
}
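// Note: BasicGenotypeMergeRule is what merges samples with the same
// "Sample:LibraryPrepID" name -- depths are summed and the genotype is
// re-called from the combined depths, with aveSeqErrorRate() used to decide
// between heterozygous and homozygous calls (see the class javadoc).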
/**
* Gets an ArrayList of taxa, each named "Sample:LibraryPrepID" and annotated
* with Flowcell_Lane as well as all other annotations in the key file, for
* the corresponding fastq file.
*
* @return ArrayList of Taxon objects
*/
private ArrayList<Taxon> getHDF5Taxa(int fileNum) {
String currFlowcellLane = seqFileNameToFlowcellLane.get(myRawSeqFileNames[fileNum]);
String[] flowcellLane = currFlowcellLane.split("_");
TaxaList annoTL = TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), "LibraryPrepID",
ImmutableMap.of("Flowcell", flowcellLane[0], "Lane", flowcellLane[1]), false);
ArrayList<Taxon> taxaAL = new ArrayList<>();
for (Taxon tax : annoTL) {
GeneralAnnotation annotation = tax.getAnnotation();
String newName = annotation.getTextAnnotation("Sample").length == 0 ? annotation.getTextAnnotation("DNASample")[0] : annotation.getTextAnnotation("Sample")[0];
String libPrepID = tax.getName();
newName += ":" + libPrepID;
Taxon gbsTaxon = new Taxon.Builder(tax).name(newName).addAnno("Flowcell_Lane", currFlowcellLane)
.addAnno("LibraryPrepID", libPrepID).addAnno("Status", "private").build();
taxaAL.add(gbsTaxon);
}
return taxaAL;
}
private PositionList getUniquePositions() {
//todo Move this code to TOPM
myLogger.info("\nCounting sites in TOPM file");
PositionListBuilder plb = new PositionListBuilder();
chromosomes = topm.getChromosomes();
for (Integer chrNum : chromosomes) {
Chromosome chr = new Chromosome(chrNum.toString());
for (int pos : topm.getUniquePositions(topm.getChromosomeIndex(chrNum))) {
Position p = new GeneralPosition.Builder(chr, pos).build();
plb.add(p);
}
}
PositionList pl = plb.sortPositions().build();
myLogger.info("In total, the TOPM contains " + chromosomes.length + " chromosomes and " + pl.numberOfSites() + " sites.");
return pl;
}
private void generateFastSiteLookup(PositionList pl) {
ImmutableTable.Builder<Integer, Integer, Integer> ptsB = new ImmutableTable.Builder<>();
for (int i = 0; i < pl.numberOfSites(); i++) {
Position position = pl.get(i);
ptsB.put(position.getChromosome().getChromosomeNumber(), position.getPosition(), i);
}
positionToSite = ptsB.build();
}
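// The resulting Table supports lookups of the form
// positionToSite.get(chromosomeNumber, physicalPosition) -> site index,
// which incrementDepthForTagVariants() uses to map each TOPM variant position
// onto a column of the per-taxon allele depth matrix.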
private BufferedReader getBufferedReaderForRawSeqFile(int fileNum) {
BufferedReader br = null;
try {
if (myRawSeqFileNames[fileNum].endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(myRawSeqFileNames[fileNum]))));
} else {
br = new BufferedReader(new FileReader(myRawSeqFileNames[fileNum]), 65536);
}
} catch (Exception e) {
myLogger.error("Catch in getBufferedReader(): e=" + e);
e.printStackTrace();
}
return br;
}
private ReadBarcodeResult readSequenceRead(BufferedReader br, String temp, ParseBarcodeRead thePBR, int[] counters) {
ReadBarcodeResult rr = null;
String sl = "";
try {
if (fastq) {
sl = br.readLine(); // read the 2nd line in the set of 4 lines = sequence
temp = br.readLine(); // skip the 3rd line
temp = br.readLine(); // skip the 4th in the set of 4 lines = quality score (note that the QS is scaled differently in Cassava 1.8 - we don't use it so it is not corrected here)
rr = thePBR.parseReadIntoTagAndTaxa(sl, null, true, 0);
} else { // qseq
String[] jj = temp.split("\\s");
sl = jj[8];
rr = thePBR.parseReadIntoTagAndTaxa(sl, null, false, 0);
}
} catch (Exception e) {
myLogger.error("Catch in readSequenceRead() at nReads=" + counters[0] + " e=" + e);
myLogger.error("Last line read:\n" + temp + "\n");
e.printStackTrace();
}
counters[0]++; // allReads
return rr;
}
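// Record formats handled above (the example read and qualities are hypothetical):
//   fastq: four lines per read (@header / sequence / + / quality); only the
//          sequence line is used, and quality scores are ignored.
//   qseq:  one whitespace-delimited line per read, with the sequence in the
//          9th field (index 8), e.g.
//          MACHINE 1234 3 1 1045 2175 0 1 CAGCAAACCTGGTT hhhhhhhhhhhhhh 1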
private int findBestImperfectMatch(long[] read, int[] counters) {
// this method is not ready for prime time -- to resolve a tie, it currently chooses a random tag out of the tied tags
int tagIndex = -1;
TagMatchFinder tmf = new TagMatchFinder(topm);
// TreeMap<Integer, Integer> bestHitsAndDiv = tmf.findMatchesWithIntLengthWords(read, maxDivergence(), true);
TreeMap<Integer, Integer> bestHitsAndDiv = tmf.findMatchesWithIntLengthWords(read, 0, true);
if (bestHitsAndDiv.size() > 0) {
counters[4]++; // imperfectMatches
if (bestHitsAndDiv.size() == 1) {
counters[5]++; // singleImperfectMatches
}
tagIndex = bestHitsAndDiv.firstKey(); // a random tag (firstKey) chosen to resolve the tie = suboptimal behavior
}
return tagIndex;
}
private void incrementDepthForTagVariants(int tagIndex, int[][] alleleDepths, int increment) {
int chromosome = topm.getChromosome(tagIndex);
if (chromosome == TOPMInterface.INT_MISSING) {
return;
}
int startPos = topm.getStartPosition(tagIndex);
for (int variant = 0; variant < topm.getMaxNumVariants(); variant++) {
byte newBase = topm.getVariantDef(tagIndex, variant);
if ((newBase == TOPMInterface.BYTE_MISSING) || (newBase == GenotypeTable.UNKNOWN_ALLELE)) {
continue;
}
int offset = topm.getVariantPosOff(tagIndex, variant);
int pos = startPos + offset;
// int currSite = genos.getSiteOfPhysicalPosition(pos, locus);
Integer currSite = positionToSite.get(chromosome, pos);
if (currSite == null || currSite < 0) {
continue;
}
alleleDepths[newBase][currSite] += increment;
}
}
private void callGenotypes() {
myLogger.info("\nCalling genotypes...");
for (int currTaxonIndex = 0; currTaxonIndex < obsTagsForEachTaxon.length; currTaxonIndex++) {
IntArrayList currTagList = obsTagsForEachTaxon[currTaxonIndex];
currTagList.sort();
int[][] alleleDepths = new int[NucleotideAlignmentConstants.NUMBER_NUCLEOTIDE_ALLELES][myPositionList.numberOfSites()];
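// currTagList is sorted, so identical tag indices are adjacent; the loop below
// run-length encodes the list, adding each tag's observation count (currInc)
// to the depths of all of that tag's variant sites in a single call.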
int prevTag = currTagList.getQuick(0);
int currInc = 0;
for (int t = 0; t < currTagList.size(); t++) {
int tag = currTagList.getQuick(t);
if (tag == prevTag) {
currInc++;
} else {
incrementDepthForTagVariants(prevTag, alleleDepths, currInc);
prevTag = tag;
currInc = 1;
}
}
incrementDepthForTagVariants(prevTag, alleleDepths, currInc);
byte[][] byteDepths = AlleleDepthUtil.depthIntToByte(alleleDepths);
byte[] taxonGenos = resolveGenosForTaxon(byteDepths);
if (noDepthToOutput()) {
genos.addTaxon(taxaList.get(currTaxonIndex), taxonGenos, null);
} else {
genos.addTaxon(taxaList.get(currTaxonIndex), taxonGenos, byteDepths);
}
myLogger.info(" finished calling genotypes for " + taxaList.get(currTaxonIndex).getName());
}
myLogger.info("Finished calling genotypes for " + obsTagsForEachTaxon.length + " taxa\n");
}
private byte[] resolveGenosForTaxon(byte[][] depthsForTaxon) {
int nAlleles = depthsForTaxon.length;
byte[] depthsAtSite = new byte[nAlleles];
int nSites = depthsForTaxon[0].length;
byte[] genos = new byte[nSites];
for (int site = 0; site < nSites; site++) {
for (int allele = 0; allele < nAlleles; allele++) {
depthsAtSite[allele] = depthsForTaxon[allele][site];
}
genos[site] = genoMergeRule.callBasedOnDepth(depthsAtSite);
}
return genos;
}
private void writeReadsPerSampleReports() {
myLogger.info("\nWriting ReadsPerSample log file...");
String outFileS = myOutputDir + File.separator + (new File(keyFile())).getName();
outFileS = outFileS.replaceAll("\\.txt", "_ReadsPerSample.log");
outFileS = outFileS.replaceAll("_key", "");
try {
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outFileS))), 65536);
bw.write("FullSampleName\tgoodBarcodedReads\tgoodReadsMatchedToTOPM\n");
for (String fullSampleName : rawReadCountsForFullSampleName.keySet()) {
bw.write(fullSampleName + "\t" + rawReadCountsForFullSampleName.get(fullSampleName) + "\t" + matchedReadCountsForFullSampleName.get(fullSampleName) + "\n");
}
bw.close();
} catch (Exception e) {
myLogger.error("Couldn't write to ReadsPerSample log file: " + e);
e.printStackTrace();
System.exit(1);
}
myLogger.info(" ...done\n");
}
private void printFileNameConventions(String actualFileName) {
String message
= "\n\n"
+ "Error in parsing file name:"
+ "\n The raw sequence filename does not contain either 3, 4, or 5 underscore-delimited values."
+ "\n Acceptable file naming conventions include the following (where FLOWCELL indicates the flowcell name and LANE is an integer):"
+ "\n FLOWCELL_LANE_fastq.gz"
+ "\n FLOWCELL_s_LANE_fastq.gz"
+ "\n code_FLOWCELL_s_LANE_fastq.gz"
+ "\n FLOWCELL_LANE_fastq.txt.gz"
+ "\n FLOWCELL_s_LANE_fastq.txt.gz"
+ "\n code_FLOWCELL_s_LANE_fastq.txt.gz"
+ "\n FLOWCELL_LANE_qseq.txt.gz"
+ "\n FLOWCELL_s_LANE_qseq.txt.gz"
+ "\n code_FLOWCELL_s_LANE_qseq.txt.gz"
+ "\n"
+ "\n Actual Filename: " + actualFileName
+ "\n\n";
myLogger.error(message);
}
public static final String rawSeqFileNameRegex
= "(?i)" + // case insensitive
".*\\.fq" + "$|"
+ ".*\\.fq\\.gz" + "$|"
+ ".*\\.fastq" + "$|"
+ ".*_fastq\\.txt" + "$|"
+ ".*_fastq\\.gz" + "$|"
+ ".*_fastq\\.txt\\.gz" + "$|"
+ ".*_sequence\\.txt" + "$|"
+ ".*_sequence\\.txt\\.gz" + "$|"
+ ".*_qseq\\.txt" + "$|"
+ ".*_qseq\\.txt\\.gz" + "$";
// \\. matches a literal '.' rather than the regex 'any character'
private String noMatchingRawSeqFileNamesMessage
= "Couldn't find any files that end with "
+ "\".fq\", "
+ "\".fq.gz\", "
+ "\".fastq\", "
+ "\"_fastq.txt\", "
+ "\"_fastq.gz\", "
+ "\"_fastq.txt.gz\", "
+ "\"_sequence.txt\", "
+ "\"_sequence.txt.gz\", "
+ "\"_qseq.txt\", or "
+ "\"_qseq.txt.gz\" "
+ "in the supplied directory: ";
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Production SNP Caller";
}
@Override
public String getToolTipText() {
return "Production SNP Caller";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(ProductionSNPCallerPlugin.class);
// }
/**
* Input directory containing fastq AND/OR qseq files.
*
* @return Input Directory
*/
public String inputDirectory() {
return myInputDirectory.value();
}
/**
* Set Input Directory. Input directory containing fastq AND/OR qseq files.
*
* @param value Input Directory
*
* @return this plugin
*/
public ProductionSNPCallerPlugin inputDirectory(String value) {
myInputDirectory = new PluginParameter<>(myInputDirectory, value);
return this;
}
/**
* Key file listing barcodes distinguishing the samples
*
* @return Key File
*/
public String keyFile() {
return myKeyFile.value();
}
/**
* Set Key File. Key file listing barcodes distinguishing the samples
*
* @param value Key File
*
* @return this plugin
*/
public ProductionSNPCallerPlugin keyFile(String value) {
myKeyFile = new PluginParameter<>(myKeyFile, value);
return this;
}
/**
* Enzyme used to create the GBS library
*
* @return Enzyme
*/
public String enzyme() {
return myEnzyme.value();
}
/**
* Set Enzyme. Enzyme used to create the GBS library
*
* @param value Enzyme
*
* @return this plugin
*/
public ProductionSNPCallerPlugin enzyme(String value) {
myEnzyme = new PluginParameter<>(myEnzyme, value);
return this;
}
/**
* Physical map file containing tags and corresponding variants (production
* TOPM)
*
* @return Input TOPM File
*/
public String inputTOPMFile() {
return myProductionTOPM.value();
}
/**
* Set Input TOPM File. Physical map file containing tags and corresponding
* variants (production TOPM)
*
* @param value Input TOPM File
*
* @return this plugin
*/
public ProductionSNPCallerPlugin inputTOPMFile(String value) {
myProductionTOPM = new PluginParameter<>(myProductionTOPM, value);
return this;
}
/**
* Output (target) HDF5 genotypes file to add new genotypes to (new file
* created if it doesn't exist)
*
* @return Output HDF5 Genotypes File
*/
public String outputHDF5GenotypesFile() {
return myOutputGenotypes.value();
}
/**
* Set Output HDF5 Genotypes File. Output (target) HDF5 genotypes file to
* add new genotypes to (new file created if it doesn't exist)
*
* @param value Output HDF5 Genotypes File
*
* @return this plugin
*/
public ProductionSNPCallerPlugin outputHDF5GenotypesFile(String value) {
myOutputGenotypes = new PluginParameter<>(myOutputGenotypes, value);
return this;
}
/**
* Average sequencing error rate per base (used to decide between
* heterozygous and homozygous calls)
*
* @return Ave Seq Error Rate
*/
public Double aveSeqErrorRate() {
return myAveSeqErrorRate.value();
}
/**
* Set Ave Seq Error Rate. Average sequencing error rate per base (used to
* decide between heterozygous and homozygous calls)
*
* @param value Ave Seq Error Rate
*
* @return this plugin
*/
public ProductionSNPCallerPlugin aveSeqErrorRate(Double value) {
myAveSeqErrorRate = new PluginParameter<>(myAveSeqErrorRate, value);
return this;
}
// /**
// * Maximum divergence (edit distance) between new read and previously mapped
// * read (Default: 0 = perfect matches only)
// *
// * @return Max Divergence
// */
// public Integer maxDivergence() {
// return myMaxDivergence.value();
// }
//
// /**
// * Set Max Divergence. Maximum divergence (edit distance) between new read
// * and previously mapped read (Default: 0 = perfect matches only)
// *
// * @param value Max Divergence
// *
// * @return this plugin
// */
// public ProductionSNPCallerPlugin maxDivergence(Integer value) {
// myMaxDivergence = new PluginParameter<>(myMaxDivergence, value);
// return this;
// }
/**
* Keep hdf5 genotypes open for future runs that add more taxa or more depth
*
* @return Keep Genotypes Open
*/
public Boolean keepGenotypesOpen() {
return myKeepGenotypesOpen.value();
}
/**
* Set Keep Genotypes Open. Keep hdf5 genotypes open for future runs that
* add more taxa or more depth
*
* @param value Keep Genotypes Open
*
* @return this plugin
*/
public ProductionSNPCallerPlugin keepGenotypesOpen(Boolean value) {
myKeepGenotypesOpen = new PluginParameter<>(myKeepGenotypesOpen, value);
return this;
}
/**
* No depth output: do not write depths to the output hdf5 genotypes file
*
* @return No Depth to Output
*/
public Boolean noDepthToOutput() {
return myNoDepthOutput.value();
}
/**
* Set No Depth to Output. No depth output: do not write depths to the
* output hdf5 genotypes file
*
* @param value No Depth to Output
*
* @return this plugin
*/
public ProductionSNPCallerPlugin noDepthToOutput(Boolean value) {
myNoDepthOutput = new PluginParameter<>(myNoDepthOutput, value);
return this;
}
/**
* Use STACKS likelihood method to call heterozygotes (default: use
* tasselGBS likelihood ratio method)
*
* @return Use Stacks Likelihood
*/
//public Boolean useStacksLikelihood() {
// return myStacksLikelihood.value();
//}
/**
* Set Use Stacks Likelihood. Use STACKS likelihood method to call
* heterozygotes (default: use tasselGBS likelihood ratio method)
*
* @param value Use Stacks Likelihood
*
* @return this plugin
*/
//public ProductionSNPCallerPlugin useStacksLikelihood(Boolean value) {
// myStacksLikelihood = new PluginParameter<>(myStacksLikelihood, value);
// return this;
//}
}