All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.repgen.RepGenLoadSeqToDBPlugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
package net.maizegenetics.analysis.gbs.repgen;

import java.awt.Frame;
import java.io.BufferedReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import net.maizegenetics.analysis.gbs.v2.GBSUtils;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.tag.RepGenDataWriter;
import net.maizegenetics.dna.tag.RepGenSQLite;
import net.maizegenetics.dna.tag.Tag;
import net.maizegenetics.dna.tag.TagBuilder;
import net.maizegenetics.dna.tag.TagDataSQLite;
import net.maizegenetics.dna.tag.TaxaDistBuilder;
import net.maizegenetics.dna.tag.TaxaDistribution;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * Develops a discovery rGBS database based on a folder of sequencing files
 *
 * Keeps only good reads having no N's in the
 * useful part of the sequence. Trims off the barcodes and truncates sequences
 * that (1) have a second cut site, or (2) read into the common adapter.
 * 
 * Originally the reference throughout was to "tag". This is being changed
 * to "kmer" as the pipeline is a kmer alignment process.  
 *
 * @author Ed Buckler
 */
public class RepGenLoadSeqToDBPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(RepGenLoadSeqToDBPlugin.class);

    private PluginParameter myInputDir = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true).inDir()
            .description("Input directory containing FASTQ files in text or gzipped text.\n"
                    + "     NOTE: Directory will be searched recursively and should\n"
                    + "     be written WITHOUT a slash after its name.").build();
    private PluginParameter myKeyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
            .description("Key file listing barcodes distinguishing the samples").build();
    private PluginParameter myKmerLength = new PluginParameter.Builder<>("kmerLength", 150, Integer.class).guiName("Maximum Kmer Length")
            .description("Specified length for each kmer to process").build();
    private PluginParameter myMinKmerLength = new PluginParameter.Builder<>("minKmerL", 120, Integer.class).guiName("Minimum Kmer Length")
            .description("Minimum kmer Length after second cut site is removed").build();
    private PluginParameter myMinKmerCount = new PluginParameter.Builder<>("c", 10, Integer.class).guiName("Min Kmer Count")
            .description("Minimum kmer count").build();
    private PluginParameter minTaxa = new PluginParameter.Builder<>("minTaxa",2, Integer.class).guiName("Min Taxa Represented")
            .description("Minimum numer of taxa where kmer is found").build();
    private PluginParameter myOutputDB = new PluginParameter.Builder<>("db", null, String.class).guiName("Output Database File").required(true).outFile()
            .description("Output Database File").build();
    private PluginParameter myMinQualScore = new PluginParameter.Builder<>("mnQS", 0, Integer.class).guiName("Minimum quality score").required(false)
            .description("Minimum quality score within the barcode and read length to be accepted").build();
    private PluginParameter myMaxKmerNumber = new PluginParameter.Builder<>("mxKmerNum", 50000000, Integer.class).guiName("Maximum Kmer Number").required(false)
            .description("Maximum number of kmers").build();
    private PluginParameter myBatchSize = new PluginParameter.Builder<>("batchSize", 100, Integer.class).guiName("Batch size of fastq files").required(false)
            .description("Number of flow cells being processed simultaneously").build();
//    private PluginParameter myDeleteOldData = new PluginParameter.Builder("deleteOldData",true,Boolean.class).guiName("Delete Old Data")
//            .description("Delete existing SNP quality data from db tables").build();
    LongAdder roughTagCnt = new LongAdder();

    private TagDistributionMap tagCntMap;
    private TagCountQualityScoreMap tagCntQSMap;
    private boolean taglenException;
    protected static int readEndCutSiteRemnantLength;
    protected static int qualityScoreBase;
    String[] likelyReadEndStrings;
    
    public RepGenLoadSeqToDBPlugin() {
        super(null, false);
    }

    public RepGenLoadSeqToDBPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }


    private long[] calcTagMapStats(TagDistributionMap tagCntMap) {
        long totalDepth=0, memory=0;
        int cnt=0;
        for (Map.Entry entry : tagCntMap.entrySet()) {
            memory+=entry.getValue().memorySize();
            memory+=25;
            totalDepth+=entry.getValue().totalDepth();
            cnt++;
        }
        int currentSize = tagCntMap.size();
        memory+=tagCntMap.size()*2*16;  //estimate for the map size
        long[] stats={currentSize,memory, totalDepth,totalDepth/cnt};
        System.out.printf("Map Tags:%,d  Memory:%,d  TotalDepth:%,d  AvgDepthPerTag:%d%n",stats[0],stats[1],stats[2],stats[3]);
        return stats;
    }
    @Override
    public void postProcessParameters() {

    }
    @Override
    public DataSet processData(DataSet input) {
        int batchSize = myBatchSize.value();
        float loadFactor = 0.95f; //TODO: Seems high to Ed
        tagCntMap = new TagDistributionMap (myMaxKmerNumber.value(),loadFactor, 128, this.minKmerCount());
        tagCntQSMap = new TagCountQualityScoreMap(myMaxKmerNumber.value(),loadFactor, 128, this.minKmerCount()); // must add Collections.synchonizedList as the list! 
        try {
            TaxaList masterTaxaList= TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField, new HashMap<>(), true);
            Map fileTaxaMap=TaxaListIOUtils.getUniqueMapOfTaxonByAnnotation(masterTaxaList,GBSUtils.fileNameField)
                    .orElseThrow(() -> new IllegalArgumentException("Error: Same file points more than one taxon in the KeyFile"));
            //Get the list of fastq files
            List directoryFiles= DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(myInputDir.value()).toAbsolutePath());
            if(directoryFiles.isEmpty()) {
                myLogger.warn("No files matching:"+ GBSUtils.inputFileGlob);
                return null;
            }
            // Cull files that are not represented in the given key file
            List inputSeqFiles =  directoryFiles.stream()
                    .peek(path -> System.out.println(path.getFileName().toString()))
                    .filter(path -> fileTaxaMap.containsKey(path.getFileName().toString()))
                    .collect(Collectors.toList());
            if (inputSeqFiles.size() == 0) {
                System.out.println("RepGenLoadSeqToDB:processData - found NO files represented in key file.");
                System.out.println("Please verify your file names are formatted correctly and that your key file contains the required headers.");
                return null; // no files in this directory to process
            }
            System.out.println("Found " + inputSeqFiles.size() + " files to process");
            //Files in a batch have roughly the same size
            //inputSeqFiles = this.sortFastqBySize(inputSeqFiles);
            int batchNum = inputSeqFiles.size()/batchSize;
            if (inputSeqFiles.size()%batchSize != 0) batchNum++;

            // ALways deleting old DB - need to work on code to re-create
            // array of tag quality scores if we add to old DB.
            RepGenDataWriter tdw = null;
            if (Files.exists(Paths.get(myOutputDB.value()))) {
                try {
                    Files.delete(Paths.get(myOutputDB.value()));
                } catch (Exception exc){
                    System.out.println("Error when trying to delete database file: " + myOutputDB.value());
                    System.out.println("File delete error: " + exc.getMessage());
                    return null;
                }
            }

//                if (deleteOldData()) {
// 
//                }
//                } else {
//                 // We'll append data to new DB if all new taxa are contained in db
//                    tdw=new RepGenSQLite(myOutputDB.value());
//                    TaxaList oldTaxaList = tdw.getTaxaList();
//                    boolean sameMasterTaxaList = true;
//                    Taxon badTaxon = null;
//                    for (Taxon taxa : masterTaxaList) {
//                        if (!oldTaxaList.contains(taxa)) {
//                            sameMasterTaxaList = false;
//                            badTaxon = taxa;
//                            break;
//                        }
//                    }
//                    if (!sameMasterTaxaList) {
//                        myLogger.error("Taxa list in keyfile contains taxa not in db:  " + badTaxon +
//                            ".\nEither delete existing DB, use a new DB output file, or use keyfile with entries matching existing taxa list.\n");
//                        ((TagDataSQLite)tdw).close();
//                        return null;
//                    }
//                    // Grab existing data from db, append to empty tagCntMap
//                    Map existingTDM = tdw.getAllTagsTaxaMap();
//                    tagCntMap.putAll(existingTDM);
//                    tdw.clearTagTaxaDistributionData(); // clear old data - it will be re-added at the end.
//                }
//            }

            if (tdw == null) tdw=new RepGenSQLite(myOutputDB.value());
            taglenException = false;
            for (int i = 0; i < inputSeqFiles.size(); i+=batchSize) {
                int end = i+batchSize;
                if (end > inputSeqFiles.size()) end = inputSeqFiles.size();
                ArrayList sub = new ArrayList();
                for (int j = i; j < end; j++) sub.add(inputSeqFiles.get(j));
                System.out.println("\nStart processing batch " + String.valueOf(i/batchSize+1));
                sub//.parallelStream()
                    .forEach(inputSeqFile -> {
                        try {
                            int taxaIndex=masterTaxaList.indexOf(fileTaxaMap.get(inputSeqFile.getFileName().toString()));
                            processFastQ(inputSeqFile, taxaIndex, masterTaxaList, tagCntMap, kmerLength(), minimumQualityScore());
                        } catch (StringIndexOutOfBoundsException oobe) {
                            oobe.printStackTrace();
                            myLogger.error(oobe.getMessage());
                            setTagLenException();
                            return;
                        }
                });
                if (taglenException == true) {
                    ((TagDataSQLite)tdw).close();
                    return null; // Tag length failure from processFastQ - halt processing
                }

                System.out.println("\nKmers are added from batch "+String.valueOf(i/batchSize+1) + ". Total batch number: " + batchNum);
                int currentSize = tagCntMap.size();
                System.out.println("Current number: " + String.valueOf(currentSize) + ". Max kmer number: " + String.valueOf(myMaxKmerNumber.value()));
                System.out.println(String.valueOf((float)currentSize/(float)myMaxKmerNumber.value()) + " of max tag number");

                if (currentSize > 0) { // calcTagMapStats() gets "divide by 0" error when size == 0
                    this.calcTagMapStats(tagCntMap);
                    System.out.println();
                    //make sure don't lose rare ones, need to set maxTagNumber large enough
                    System.out.println("BEFORE removeTagsWihtoutReplication, tagCntMap.size= " + tagCntMap.keySet().size()
                      + ", tagCntQSMap.size= " + tagCntQSMap.keySet().size());
                    removeTagsWithoutReplication(tagCntMap,tagCntQSMap,2,4);  //TODO lows values that keep the map under control
                    if (tagCntMap.size() == 0) {
                        System.out.println("WARNING:  After removing tags without replication, there are NO  tags left in the database");
                    } else {
                        this.calcTagMapStats(tagCntMap);
                        System.out.println();
                        System.out.println("After removeTagsWithoutReplication: Kmer number is reduced to " + tagCntMap.size()+"\n");
                    }
                    this.roughTagCnt.reset();
                    this.roughTagCnt.add(tagCntMap.size());
                } else {
                    System.out.println("WARNING: Current tagcntmap size is 0 after processing batch " + String.valueOf(i/batchSize+1) );
                }

                System.out.println("Total memory: "+ String.valueOf((double)(Runtime.getRuntime().totalMemory()/1024/1024/1024))+" Gb");
                System.out.println("Free memory: "+ String.valueOf((double)(Runtime.getRuntime().freeMemory()/1024/1024/1024))+" Gb");
                System.out.println("Max memory: "+ String.valueOf((double)(Runtime.getRuntime().maxMemory()/1024/1024/1024))+" Gb");
                System.out.println("\n");
            }
            System.out.println("\nAll the batch are processed");
            removeTagsWithoutReplication(tagCntMap,tagCntQSMap,minTaxa(),minKmerCount());
//            tagCntMap.removeTagByCount(myMinKmerCount.value());
//            tagCntQSMap.removeTagByCount(myMinKmerCount.value()); // should remove the same tags as in call above
           // System.out.println("By removing kmers with minCount of " + myMinKmerCount.value() + "Kmer number is reduced to " + tagCntMap.size()+"\n");
            System.out.printf("Filter by kmerCnt:%d taxaCount:%d results in MapSize:%d\n",minKmerCount(), minTaxa(), tagCntMap.size());

            // Now create map of count/qualityString
            // THis must have the same number of keys as does tagCntMap.
            
            if (tagCntMap.keySet().size() != tagCntQSMap.keySet().size()) {
                System.out.println("Mismatch between size of tagCntMap " + tagCntMap.keySet().size() 
                        + ", and tagCntQSMap " + tagCntQSMap.keySet().size());
                System.out.println("quitting - nothing added to DB!");
                ((RepGenSQLite)tdw).close();
                return null;    
            } else {
                System.out.println("\ntagCntMap and tagCntQSMap have same size: " + tagCntMap.keySet().size());
            }
            
            Map> tagInstanceAverageQS = calculateTagAveQS(tagCntQSMap,qualityScoreBase);
            System.out.println("Before add to DB: sizeof tagInstanceAverageQS.keySet = " + tagInstanceAverageQS.keySet().size());
            tdw.putTaxaList(masterTaxaList);
            // 2 fields to putALlTag as some callers of this method don't hvae the qs/num-instances data
            //tdw.putAllTag(tagCntMap.keySet(), null);
            tdw.putAllTag(tagCntMap.keySet(),tagInstanceAverageQS); // includes tag, the number of instances of this tag, and the average quality string
            tdw.putTaxaDistribution(tagCntMap);
            ((RepGenSQLite)tdw).close();  //todo autocloseable should do this but it is not working.
        } catch(Exception e) {
            e.printStackTrace();
        }
        return new DataSet(new Datum("TagMap",tagCntMap,""),this);
    }

    private void processFastQ(Path fastqFile, int taxaIndex, TaxaList masterTaxaList,
                              TagDistributionMap masterTagTaxaMap, int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException{
        int allReads=0, goodBarcodedReads = 0, lowQualityReads = 0, tooShortReads = 0;
        int maxTaxaNumber=masterTaxaList.size();
        int checkSize = 10000000;
        myLogger.info("processing file " + fastqFile.toString());
        try {
            //int qualityScoreBase=GBSUtils.determineQualityScoreBase(fastqFile);
            qualityScoreBase=GBSUtils.determineQualityScoreBase(fastqFile);
            BufferedReader br = Utils.getBufferedReader(fastqFile.toString(), 1 << 22);
            long time=System.nanoTime();
            String[] seqAndQual;

            while ((seqAndQual=GBSUtils.readFastQBlock(br,allReads)) != null) {
                allReads++;
                int tagEnd = seqAndQual[0].length() < preferredTagLength ? seqAndQual[0].length() : preferredTagLength;
                // Check for and toss low quality reads
                if(minQual>0) {
                    //todo move getFirstLowQualityPos into this class?
                    if(BaseEncoder.getFirstLowQualityPos(seqAndQual[1],minQual, qualityScoreBase)<(tagEnd)){
                    	lowQualityReads++;
                    	continue;
                    }
                }

                // preferredTagLength is actually maximum tag length.  We need to check if 
                // tag is less than minimumKmerLength, toss that.  And truncate if is greater
                // than kmerLength.
                if (seqAndQual[0].length() < minimumKmerLength()) {
                //if (seqAndQual[0].length() < preferredTagLength) {
                    //System.out.println("Tossing kmer, length too short: " + seqAndQual[0].length());
//                	String errMsg = "\n\nERROR processing " + fastqFile.toString() + "\n" +
//                			"Reading entry number " + allReads + " fails the length test.\n" +
//                			"Sequence length " + seqAndQual[0].length() +
//                			" is less then maxKmerLength " + minimumKmerLength() + ".\n" +
//                			"Re-run your files with either a shorter mxKmerL value or a higher minimum quality score.\n";
                	//throw new StringIndexOutOfBoundsException(errMsg);
                    tooShortReads++;
                	continue; // for combined reads, just continue, don't error out.
                }
                //Tag tag = TagBuilder.instance(seqAndQual[0].substring(0,preferredTagLength)).build();

                Tag tag = TagBuilder.instance(seqAndQual[0].substring(0,tagEnd)).build();
                

                if(tag==null) continue;   //null occurs when any base was not A, C, G, T
                goodBarcodedReads++;
                TaxaDistribution taxaDistribution=masterTagTaxaMap.get(tag);
                if(taxaDistribution==null) {
                    masterTagTaxaMap.put(tag,TaxaDistBuilder.create(maxTaxaNumber,taxaIndex));
                    this.roughTagCnt.increment();
                } else {
                    taxaDistribution.increment(taxaIndex);
                }
                // add to qualityScoreMap
               // String tagQS = seqAndQual[1].substring(0,preferredTagLength);
                String tagQS = seqAndQual[1].substring(0,tagEnd);
                List tagScores = tagCntQSMap.get(tag);
                if (tagScores==null) {
                    // create new synchronizedList
                    tagScores =  Collections.synchronizedList(new ArrayList());
                    tagScores.add(tagQS);
                    tagCntQSMap.put(tag, tagScores);
                } else {
                    tagScores.add(tagQS);
                }
                if (allReads % checkSize == 0) {
                    myLogger.info("Total Reads:" + allReads + " Reads with barcode and cut site overhang:" + goodBarcodedReads
                            + " rate:" + (System.nanoTime()-time)/allReads +" ns/read. Current tag count:" + this.roughTagCnt);
                }
            }
            myLogger.info("Summary for "+fastqFile.toString()+"\n"+
                    "Total number of reads in lane=" + allReads +"\n"+
                    "Total number of good barcoded reads=" + goodBarcodedReads+"\n"+
                    "Total number of low quality reads=" + lowQualityReads+"\n"+
                    "Total number of too short reads=" + tooShortReads + "\n"+
                    "Timing process (sorting, collapsing, and writing TagCount to file)."+"\n"+
                    "Process took " + (System.nanoTime() - time)/1e6 + " milliseconds.");
            System.out.println("tagCntMap size: "+masterTagTaxaMap.size());
            br.close();
        } catch (StringIndexOutOfBoundsException oobe) {
        	throw oobe; // pass it up to print error and stop processing
        } catch (Exception e) {
            myLogger.error("Good Barcodes Read: " + goodBarcodedReads);

            e.printStackTrace();
        }
    }


    /**
     * This method removes all tags are are never repeated in a single sample (taxa).  The concept is that
     * all biologically real tag should show up twice somewhere.  This could be called at the end of every
     * flowcell to test all the novel tags.
     * 
     * In addition, it removes the corresponding entry from the tag-qualityscore map.  They must remain in synch
     */
    private static void removeTagsWithoutReplication (TagDistributionMap masterTagTaxaMap, TagCountQualityScoreMap tagCntQSMap,int minTaxa, int minDepth) {
        int currentSize = masterTagTaxaMap.size();
       // int minTaxa=2;
        System.out.println("Starting removeTagsWithoutReplication. Current tag number: " + currentSize);
        LongAdder tagsRemoved=new LongAdder();
        masterTagTaxaMap.entrySet().parallelStream().forEach(t -> {
            TaxaDistribution td = t.getValue();
            if(td.totalDepth() depth>1).count()> calculateTagAveQS(TagCountQualityScoreMap tagCntQSMap, int qualityScoreBase) {
        
        Map> tagInstanceAverageQS = new ConcurrentHashMap>();
        
        // the number of instances for each tag equals the number of values on the arraylist
        // iterate through the list, add values to the tagInstanceAverageQS map
        tagCntQSMap.entrySet().parallelStream().forEach(entry -> {
            Tag tag = entry.getKey();
            List scores = entry.getValue();
            // add all the scores, divide by number of scores, convert back to string
            int numInstances = scores.size();
            int scoreLen = scores.get(0).length();
            // the scores should all be the same length as the tag
            int[] scoreArray = new int[scoreLen];
            
            for (int idx = 0; idx < numInstances; idx++) {
                String currentScore = scores.get(idx);
                for (int jdx = 0; jdx < scoreLen; jdx++) {
                    scoreArray[jdx] += (currentScore.charAt(jdx) - qualityScoreBase);
                }
            }
            // positions for all scores are added.  now divide by number of scores to get
            // the average, translate value back to string
            StringBuilder sb = new StringBuilder();
            for (int jdx = 0; jdx < scoreLen; jdx++) {
                scoreArray[jdx] /= numInstances;
                char ch = (char)(scoreArray[jdx] + qualityScoreBase);
                sb.append(ch);
            }
            // Store to map
            Tuple instanceScore = new Tuple(numInstances,sb.toString());
            tagInstanceAverageQS.put(tag, instanceScore);           
        });
        
        return tagInstanceAverageQS;
    }

// The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
//     public static void main(String[] args) {
//         GeneratePluginCode.generate(RepGenLoadSeqToDBPlugin.class);
//     }

    /**
     * Convenience method to run plugin with one return object.
     */
    // TODO: Replace  with specific type.
//    public  runPlugin(DataSet input) {
//        return () performFunction(input).getData(0).getData();
//    }

    /**
     * Input directory containing FASTQ files in text or gzipped
     * text.
     *      NOTE: Directory will be searched recursively and
     * should
     *      be written WITHOUT a slash after its name.
     *
     * @return Input Directory
     */
    public String inputDirectory() {
        return myInputDir.value();
    }

    /**
     * Set Input Directory. Input directory containing FASTQ
     * files in text or gzipped text.
     *      NOTE: Directory will be searched recursively and
     * should
     *      be written WITHOUT a slash after its name.
     *
     * @param value Input Directory
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin inputDirectory(String value) {
        myInputDir = new PluginParameter<>(myInputDir, value);
        return this;
    }

    /**
     * Key file listing barcodes distinguishing the samples
     *
     * @return Key File
     */
    public String keyFile() {
        return myKeyFile.value();
    }

    /**
     * Set Key File. Key file listing barcodes distinguishing
     * the samples
     *
     * @param value Key File
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin keyFile(String value) {
        myKeyFile = new PluginParameter<>(myKeyFile, value);
        return this;
    }


    /**
     * Maximum Tag Length
     *
     * @return Maximum Tag Length
     */
    public Integer kmerLength() {
        return myKmerLength.value();
    }

    /**
     * Set Maximum Tag Length. Maximum Tag Length
     *
     * @param value Maximum Tag Length
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin kmerLength(Integer value) {
        myKmerLength = new PluginParameter<>(myKmerLength, value);
        return this;
    }

    /**
     * Minimum Tag Length
     *
     * @return Minimum Tag Length
     */
    public Integer minimumKmerLength() {
        return myMinKmerLength.value();
    }

    /**
     * Set Minimum Tag Length. Minimum Tag Length
     *
     * @param value Minimum Tag Length
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin minimumKmerLength(Integer value) {
        myMinKmerLength = new PluginParameter<>(myMinKmerLength, value);
        return this;
    }

    /**
     * Minimum tag count
     *
     * @return Min Tag Count
     */
    public Integer minKmerCount() {
        return myMinKmerCount.value();
    }

    /**
     * Set Min Tag Count. Minimum tag count
     *
     * @param value Min Tag Count
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin minKmerCount(Integer value) {
        myMinKmerCount = new PluginParameter<>(myMinKmerCount, value);
        return this;
    }

    /**
     * Minimum taxa containing tag
     *
     * @return Min Taxa Count
     */
    public Integer minTaxa() {
        return minTaxa.value();
    }

    /**
     * Set Min Taxa Count. Minimum Taxa count
     *
     * @param value Min Taxa containing a tag
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin minTaxa(Integer value) {
        minTaxa = new PluginParameter<>(minTaxa, value);
        return this;
    }
    /**
     * Output Database File
     *
     * @return Output Database File
     */
    public String outputDatabaseFile() {
        return myOutputDB.value();
    }

    /**
     * Set Output Database File. Output Database File
     *
     * @param value Output Database File
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin outputDatabaseFile(String value) {
        myOutputDB = new PluginParameter<>(myOutputDB, value);
        return this;
    }

    /**
     * Minimum quality score within the barcode and read length
     * to be accepted
     *
     * @return Minimum quality score
     */
    public Integer minimumQualityScore() {
        return myMinQualScore.value();
    }

    /**
     * Set Minimum quality score. Minimum quality score within
     * the barcode and read length to be accepted
     *
     * @param value Minimum quality score
     *
     * @return this plugin
     */
    public RepGenLoadSeqToDBPlugin minimumQualityScore(Integer value) {
        myMinQualScore = new PluginParameter<>(myMinQualScore, value);
        return this;
    }

    /**
     * Set maximum number of kmers
     * @param value
     * @return
     */
    public RepGenLoadSeqToDBPlugin maximumKmerNumber(Integer value) {
        myMaxKmerNumber = new PluginParameter<>(myMaxKmerNumber, value);
        return this;
    }

    /**
     * Set number of Fastq files processed simultaneously
     * @param value
     * @return
     */
    public RepGenLoadSeqToDBPlugin batchSize(Integer value) {
        myBatchSize = new PluginParameter<>(myBatchSize, value);
        return this;
    }
//    /**
//     * Delete exisiting  DB
//     *
//     * @return deleteOldData
//     */
//    public Boolean deleteOldData() {
//        return myDeleteOldData.value();
//    }
//
//    /**
//     * Set Delete old data flag.  True indicates we want the
//     * db tables cleared
//     *
//     * @param value true/false - whether to delete data
//     *
//     * @return this plugin
//     */
//    public RepGenLoadSeqToDBPlugin deleteOldData(Boolean value) {
//        myDeleteOldData = new PluginParameter<>(myDeleteOldData, value);
//        return this;
//    }
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "Discovery Tags By Taxa";
    }

    @Override
    public String getToolTipText() {
        return "Discovery Tags By Taxa";
    }

    /**
     * This ConcurrentHashMap constrain the size of the map, and purges low distribution count tags when the size needs
     * to be reduced.
     */
    static class TagDistributionMap extends ConcurrentHashMap {
        private final int maxTagNum;
        private int minDepthToRetainInMap=2;
        private final int minCount;

        TagDistributionMap (int maxTagNumber, float loadFactor, int concurrencyLevel, int minCount) {
            super((maxTagNumber*2), loadFactor, concurrencyLevel);
            maxTagNum = maxTagNumber;
            this.minCount = minCount;
        }

        @Override
        public TaxaDistribution put(Tag key, TaxaDistribution value) {
            return super.put(key, value);
        }

        public synchronized void removeTagByCount(int minCnt) {
            entrySet().parallelStream()
                    .filter(e -> e.getValue().totalDepth() remove(e.getKey()));
        }


        public long estimateMapMemorySize() {
            long size=0;
            int cnt=0;
            for (Entry entry : entrySet()) {
                size+=8+16+1; //Tag size
                size+=16; //Map references
                size+=entry.getValue().memorySize();
                cnt++;
                if(cnt>10000) break;
        }
            long estSize=(size()/cnt)*size;
            return estSize;
        }

        public long[] depthDistribution() {
            long[] base2bins=new long[34];
            int cnt=0;
            for (Entry entry : entrySet()) {
                base2bins[31-Integer.numberOfLeadingZeros(entry.getValue().totalDepth())]++;
                cnt++;
               // if(cnt>100000) break;
            }
            return base2bins;
        }
    }
    
    // LCJ - THe easiest and fastest method is to keep track of all the
    // quality strings seen for each iteration of the tag.  SOme of these
    // tags will be dropped when we removeTagWIthoutReplication() and other
    // methods.  Those need to be synced up with this list.
    
    //If we create a hashmap of tag/list-of-quality-strings, then after
    // culling the list at the end, we can create the average score.  The
    // number of strings on the list give us the number of tag instances.
    
    // THe map itself is a concurrent hashmap, but the list also must
    // be concurrently.  It can be defined as "List" but the value
    // added must be a synchonizedList
    
    // the list that is added needs to be a synchonized list.  
    // http://stackoverflow.com/questions/5923405/concurrenthashmap-with-arraylist-as-value
    // Cannot make this a multimap as multimap will not store duplicate quality scores,
    public static class TagCountQualityScoreMap extends ConcurrentHashMap> {
        private  int tagCount;

        TagCountQualityScoreMap (int maxTagNumber, float loadFactor, int concurrencyLevel, int minCount) {
            super((maxTagNumber*2), loadFactor, concurrencyLevel);

        }

        public synchronized void removeTagByCount(int minCnt) {
            // remove tags which are not represented a minimum number of times
            // THis is determimed by how many quality strings occur for the tag
            // should be consistent with removeTagByCount for TagDistributionMap
            entrySet().parallelStream()
                    .filter(e -> e.getValue().size()  remove(e.getKey()));
        }
    }
    
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy