
// net.maizegenetics.analysis.gbs.repgen.RepGenLoadSeqToDBPlugin (source-listing header: Maven / Gradle / Ivy)
package net.maizegenetics.analysis.gbs.repgen;
import java.awt.Frame;
import java.io.BufferedReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import net.maizegenetics.analysis.gbs.v2.GBSUtils;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.tag.RepGenDataWriter;
import net.maizegenetics.dna.tag.RepGenSQLite;
import net.maizegenetics.dna.tag.Tag;
import net.maizegenetics.dna.tag.TagBuilder;
import net.maizegenetics.dna.tag.TagDataSQLite;
import net.maizegenetics.dna.tag.TaxaDistBuilder;
import net.maizegenetics.dna.tag.TaxaDistribution;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* Develops a discovery rGBS database based on a folder of sequencing files
*
* Keeps only good reads having no N's in the
* useful part of the sequence. Trims off the barcodes and truncates sequences
* that (1) have a second cut site, or (2) read into the common adapter.
*
* Originally the reference throughout was to "tag". This is being changed
* to "kmer" as the pipeline is a kmer alignment process.
*
* @author Ed Buckler
*/
public class RepGenLoadSeqToDBPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(RepGenLoadSeqToDBPlugin.class);

// Plugin parameters. NOTE(review): the generic type arguments were lost in the
// source extraction and have been reconstructed from the value class passed to
// each PluginParameter.Builder.
private PluginParameter<String> myInputDir = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true).inDir()
        .description("Input directory containing FASTQ files in text or gzipped text.\n"
                + " NOTE: Directory will be searched recursively and should\n"
                + " be written WITHOUT a slash after its name.").build();
private PluginParameter<String> myKeyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
        .description("Key file listing barcodes distinguishing the samples").build();
private PluginParameter<Integer> myKmerLength = new PluginParameter.Builder<>("kmerLength", 150, Integer.class).guiName("Maximum Kmer Length")
        .description("Specified length for each kmer to process").build();
private PluginParameter<Integer> myMinKmerLength = new PluginParameter.Builder<>("minKmerL", 120, Integer.class).guiName("Minimum Kmer Length")
        .description("Minimum kmer Length after second cut site is removed").build();
private PluginParameter<Integer> myMinKmerCount = new PluginParameter.Builder<>("c", 10, Integer.class).guiName("Min Kmer Count")
        .description("Minimum kmer count").build();
private PluginParameter<Integer> minTaxa = new PluginParameter.Builder<>("minTaxa", 2, Integer.class).guiName("Min Taxa Represented")
        .description("Minimum number of taxa where kmer is found").build();
private PluginParameter<String> myOutputDB = new PluginParameter.Builder<>("db", null, String.class).guiName("Output Database File").required(true).outFile()
        .description("Output Database File").build();
private PluginParameter<Integer> myMinQualScore = new PluginParameter.Builder<>("mnQS", 0, Integer.class).guiName("Minimum quality score").required(false)
        .description("Minimum quality score within the barcode and read length to be accepted").build();
private PluginParameter<Integer> myMaxKmerNumber = new PluginParameter.Builder<>("mxKmerNum", 50000000, Integer.class).guiName("Maximum Kmer Number").required(false)
        .description("Maximum number of kmers").build();
private PluginParameter<Integer> myBatchSize = new PluginParameter.Builder<>("batchSize", 100, Integer.class).guiName("Batch size of fastq files").required(false)
        .description("Number of flow cells being processed simultaneously").build();
//    private PluginParameter<Boolean> myDeleteOldData = new PluginParameter.Builder<>("deleteOldData",true,Boolean.class).guiName("Delete Old Data")
//            .description("Delete existing SNP quality data from db tables").build();

LongAdder roughTagCnt = new LongAdder();      // running count of distinct tags created so far
private TagDistributionMap tagCntMap;         // tag -> taxa distribution (per-taxon depths)
private TagCountQualityScoreMap tagCntQSMap;  // tag -> quality string per observed instance
private boolean taglenException;              // set when a read fails the length test; halts processing
protected static int readEndCutSiteRemnantLength;
protected static int qualityScoreBase;        // FASTQ quality encoding base, set per input file
String[] likelyReadEndStrings;
/** Non-interactive constructor: no parent frame, GUI disabled. */
public RepGenLoadSeqToDBPlugin() {
    super(null, false);
}
/**
 * Standard plugin constructor.
 *
 * @param parentFrame   parent GUI frame (may be null for CLI use)
 * @param isInteractive true when running inside the TASSEL GUI
 */
public RepGenLoadSeqToDBPlugin(Frame parentFrame, boolean isInteractive) {
    super(parentFrame, isInteractive);
}
/**
 * Computes and prints summary statistics for the tag/taxa-distribution map.
 *
 * @param tagCntMap map to summarize
 * @return {tag count, estimated memory bytes, total depth, average depth per tag}
 */
private long[] calcTagMapStats(TagDistributionMap tagCntMap) {
    long totalDepth = 0, memory = 0;
    int cnt = 0;
    // Raw Map.Entry in the original (extraction artifact); typed entries restored.
    for (Map.Entry<Tag, TaxaDistribution> entry : tagCntMap.entrySet()) {
        memory += entry.getValue().memorySize();
        memory += 25;
        totalDepth += entry.getValue().totalDepth();
        cnt++;
    }
    int currentSize = tagCntMap.size();
    memory += tagCntMap.size() * 2 * 16; // estimate for the map size
    // Guard the average against an empty map (original divided by zero).
    long avgDepth = (cnt == 0) ? 0 : totalDepth / cnt;
    long[] stats = {currentSize, memory, totalDepth, avgDepth};
    System.out.printf("Map Tags:%,d Memory:%,d TotalDepth:%,d AvgDepthPerTag:%d%n", stats[0], stats[1], stats[2], stats[3]);
    return stats;
}
@Override
public void postProcessParameters() {
    // No cross-parameter validation is required for this plugin.
}
/**
 * Main pipeline entry: processes all FASTQ files under the input directory in
 * batches, accumulating per-tag taxa distributions (tagCntMap) and per-instance
 * quality strings (tagCntQSMap). After all batches, tags lacking sufficient
 * depth/replication are removed, average quality strings are computed, and the
 * surviving tags plus taxa distributions are written to a freshly created
 * RepGen SQLite database.
 *
 * @param input standard plugin DataSet (not read by this method)
 * @return DataSet wrapping the final tag map, or null on any failure
 */
@Override
public DataSet processData(DataSet input) {
    int batchSize = myBatchSize.value();
    float loadFactor = 0.95f; //TODO: Seems high to Ed
    tagCntMap = new TagDistributionMap(myMaxKmerNumber.value(), loadFactor, 128, this.minKmerCount());
    // Values stored in tagCntQSMap must be Collections.synchronizedList lists.
    tagCntQSMap = new TagCountQualityScoreMap(myMaxKmerNumber.value(), loadFactor, 128, this.minKmerCount());
    try {
        TaxaList masterTaxaList = TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField, new HashMap<>(), true);
        Map<String, Taxon> fileTaxaMap = TaxaListIOUtils.getUniqueMapOfTaxonByAnnotation(masterTaxaList, GBSUtils.fileNameField)
                .orElseThrow(() -> new IllegalArgumentException("Error: Same file points more than one taxon in the KeyFile"));
        // Get the list of fastq files
        List<Path> directoryFiles = DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(myInputDir.value()).toAbsolutePath());
        if (directoryFiles.isEmpty()) {
            myLogger.warn("No files matching:" + GBSUtils.inputFileGlob);
            return null;
        }
        // Cull files that are not represented in the given key file
        List<Path> inputSeqFiles = directoryFiles.stream()
                .peek(path -> System.out.println(path.getFileName().toString()))
                .filter(path -> fileTaxaMap.containsKey(path.getFileName().toString()))
                .collect(Collectors.toList());
        if (inputSeqFiles.isEmpty()) {
            System.out.println("RepGenLoadSeqToDB:processData - found NO files represented in key file.");
            System.out.println("Please verify your file names are formatted correctly and that your key file contains the required headers.");
            return null; // no files in this directory to process
        }
        System.out.println("Found " + inputSeqFiles.size() + " files to process");
        //Files in a batch have roughly the same size
        //inputSeqFiles = this.sortFastqBySize(inputSeqFiles);
        int batchNum = inputSeqFiles.size() / batchSize;
        if (inputSeqFiles.size() % batchSize != 0) batchNum++;
        // Always deleting old DB - need to work on code to re-create
        // array of tag quality scores if we add to old DB.
        RepGenDataWriter tdw = null;
        if (Files.exists(Paths.get(myOutputDB.value()))) {
            try {
                Files.delete(Paths.get(myOutputDB.value()));
            } catch (Exception exc) {
                System.out.println("Error when trying to delete database file: " + myOutputDB.value());
                System.out.println("File delete error: " + exc.getMessage());
                return null;
            }
        }
        if (tdw == null) tdw = new RepGenSQLite(myOutputDB.value());
        taglenException = false;
        for (int i = 0; i < inputSeqFiles.size(); i += batchSize) {
            int end = Math.min(i + batchSize, inputSeqFiles.size());
            List<Path> sub = new ArrayList<>(inputSeqFiles.subList(i, end));
            System.out.println("\nStart processing batch " + (i / batchSize + 1));
            sub//.parallelStream()
                    .forEach(inputSeqFile -> {
                        try {
                            int taxaIndex = masterTaxaList.indexOf(fileTaxaMap.get(inputSeqFile.getFileName().toString()));
                            processFastQ(inputSeqFile, taxaIndex, masterTaxaList, tagCntMap, kmerLength(), minimumQualityScore());
                        } catch (StringIndexOutOfBoundsException oobe) {
                            oobe.printStackTrace();
                            myLogger.error(oobe.getMessage());
                            setTagLenException();
                            return;
                        }
                    });
            if (taglenException == true) {
                ((TagDataSQLite) tdw).close();
                return null; // Tag length failure from processFastQ - halt processing
            }
            System.out.println("\nKmers are added from batch " + (i / batchSize + 1) + ". Total batch number: " + batchNum);
            int currentSize = tagCntMap.size();
            System.out.println("Current number: " + currentSize + ". Max kmer number: " + myMaxKmerNumber.value());
            System.out.println(String.valueOf((float) currentSize / (float) myMaxKmerNumber.value()) + " of max tag number");
            if (currentSize > 0) { // calcTagMapStats() gets "divide by 0" error when size == 0
                this.calcTagMapStats(tagCntMap);
                System.out.println();
                // Make sure we don't lose rare tags; maxTagNumber must be set large enough.
                System.out.println("BEFORE removeTagsWihtoutReplication, tagCntMap.size= " + tagCntMap.keySet().size()
                        + ", tagCntQSMap.size= " + tagCntQSMap.keySet().size());
                removeTagsWithoutReplication(tagCntMap, tagCntQSMap, 2, 4); //TODO lows values that keep the map under control
                if (tagCntMap.size() == 0) {
                    System.out.println("WARNING: After removing tags without replication, there are NO tags left in the database");
                } else {
                    this.calcTagMapStats(tagCntMap);
                    System.out.println();
                    System.out.println("After removeTagsWithoutReplication: Kmer number is reduced to " + tagCntMap.size() + "\n");
                }
                this.roughTagCnt.reset();
                this.roughTagCnt.add(tagCntMap.size());
            } else {
                System.out.println("WARNING: Current tagcntmap size is 0 after processing batch " + (i / batchSize + 1));
            }
            System.out.println("Total memory: " + (double) (Runtime.getRuntime().totalMemory() / 1024 / 1024 / 1024) + " Gb");
            System.out.println("Free memory: " + (double) (Runtime.getRuntime().freeMemory() / 1024 / 1024 / 1024) + " Gb");
            System.out.println("Max memory: " + (double) (Runtime.getRuntime().maxMemory() / 1024 / 1024 / 1024) + " Gb");
            System.out.println("\n");
        }
        System.out.println("\nAll the batch are processed");
        removeTagsWithoutReplication(tagCntMap, tagCntQSMap, minTaxa(), minKmerCount());
        System.out.printf("Filter by kmerCnt:%d taxaCount:%d results in MapSize:%d\n", minKmerCount(), minTaxa(), tagCntMap.size());
        // tagCntMap and tagCntQSMap must have the same key set or the DB
        // rows (tag + instance count + average quality) would be inconsistent.
        if (tagCntMap.keySet().size() != tagCntQSMap.keySet().size()) {
            System.out.println("Mismatch between size of tagCntMap " + tagCntMap.keySet().size()
                    + ", and tagCntQSMap " + tagCntQSMap.keySet().size());
            System.out.println("quitting - nothing added to DB!");
            ((RepGenSQLite) tdw).close();
            return null;
        } else {
            System.out.println("\ntagCntMap and tagCntQSMap have same size: " + tagCntMap.keySet().size());
        }
        Map<Tag, Tuple<Integer, String>> tagInstanceAverageQS = calculateTagAveQS(tagCntQSMap, qualityScoreBase);
        System.out.println("Before add to DB: sizeof tagInstanceAverageQS.keySet = " + tagInstanceAverageQS.keySet().size());
        tdw.putTaxaList(masterTaxaList);
        // putAllTag takes 2 fields as some callers of this method don't have the qs/num-instances data
        tdw.putAllTag(tagCntMap.keySet(), tagInstanceAverageQS); // includes tag, the number of instances of this tag, and the average quality string
        tdw.putTaxaDistribution(tagCntMap);
        ((RepGenSQLite) tdw).close(); //todo autocloseable should do this but it is not working.
    } catch (Exception e) {
        e.printStackTrace();
    }
    return new DataSet(new Datum("TagMap", tagCntMap, ""), this);
}
/**
 * Reads one FASTQ file and folds each acceptable read into the shared maps:
 * masterTagTaxaMap gains (or increments) a TaxaDistribution for the read's
 * tag, and tagCntQSMap collects the read's quality substring for later
 * averaging. Reads are skipped when a base within the tag region falls below
 * minQual, when the read is shorter than minimumKmerLength(), or when the
 * sequence contains a non-ACGT base (TagBuilder returns null).
 *
 * @param fastqFile          FASTQ file (text or gzipped) to process
 * @param taxaIndex          index of this file's taxon within masterTaxaList
 * @param masterTaxaList     full taxa list from the key file
 * @param masterTagTaxaMap   shared tag -> taxa-distribution map (mutated)
 * @param preferredTagLength maximum tag length; longer reads are truncated
 * @param minQual            minimum per-base quality; 0 disables the check
 * @throws StringIndexOutOfBoundsException propagated so the caller can halt processing
 */
private void processFastQ(Path fastqFile, int taxaIndex, TaxaList masterTaxaList,
        TagDistributionMap masterTagTaxaMap, int preferredTagLength, int minQual) throws StringIndexOutOfBoundsException {
    int allReads = 0, goodBarcodedReads = 0, lowQualityReads = 0, tooShortReads = 0;
    int maxTaxaNumber = masterTaxaList.size();
    int checkSize = 10000000; // progress-log interval, in reads
    myLogger.info("processing file " + fastqFile.toString());
    try {
        // NOTE(review): writes a static field per file; unsafe if files are ever
        // processed in parallel (see the disabled parallelStream in processData) - confirm.
        qualityScoreBase = GBSUtils.determineQualityScoreBase(fastqFile);
        long time = System.nanoTime();
        // try-with-resources: the original leaked the reader when an exception escaped.
        try (BufferedReader br = Utils.getBufferedReader(fastqFile.toString(), 1 << 22)) {
            String[] seqAndQual;
            while ((seqAndQual = GBSUtils.readFastQBlock(br, allReads)) != null) {
                allReads++;
                // preferredTagLength is actually the maximum tag length: truncate longer reads.
                int tagEnd = Math.min(seqAndQual[0].length(), preferredTagLength);
                // Check for and toss low quality reads
                if (minQual > 0) {
                    //todo move getFirstLowQualityPos into this class?
                    if (BaseEncoder.getFirstLowQualityPos(seqAndQual[1], minQual, qualityScoreBase) < tagEnd) {
                        lowQualityReads++;
                        continue;
                    }
                }
                // Toss reads shorter than minimumKmerLength; for combined reads,
                // just continue, don't error out.
                if (seqAndQual[0].length() < minimumKmerLength()) {
                    tooShortReads++;
                    continue;
                }
                Tag tag = TagBuilder.instance(seqAndQual[0].substring(0, tagEnd)).build();
                if (tag == null) continue; // null occurs when any base was not A, C, G, T
                goodBarcodedReads++;
                TaxaDistribution taxaDistribution = masterTagTaxaMap.get(tag);
                if (taxaDistribution == null) {
                    masterTagTaxaMap.put(tag, TaxaDistBuilder.create(maxTaxaNumber, taxaIndex));
                    this.roughTagCnt.increment();
                } else {
                    taxaDistribution.increment(taxaIndex);
                }
                // Record this instance's quality substring. computeIfAbsent avoids the
                // check-then-put race of the original; the list itself is synchronized
                // because multiple threads may append concurrently.
                String tagQS = seqAndQual[1].substring(0, tagEnd);
                tagCntQSMap.computeIfAbsent(tag, k -> Collections.synchronizedList(new ArrayList<String>()))
                        .add(tagQS);
                if (allReads % checkSize == 0) {
                    myLogger.info("Total Reads:" + allReads + " Reads with barcode and cut site overhang:" + goodBarcodedReads
                            + " rate:" + (System.nanoTime() - time) / allReads + " ns/read. Current tag count:" + this.roughTagCnt);
                }
            }
            myLogger.info("Summary for " + fastqFile.toString() + "\n" +
                    "Total number of reads in lane=" + allReads + "\n" +
                    "Total number of good barcoded reads=" + goodBarcodedReads + "\n" +
                    "Total number of low quality reads=" + lowQualityReads + "\n" +
                    "Total number of too short reads=" + tooShortReads + "\n" +
                    "Timing process (sorting, collapsing, and writing TagCount to file)." + "\n" +
                    "Process took " + (System.nanoTime() - time) / 1e6 + " milliseconds.");
            System.out.println("tagCntMap size: " + masterTagTaxaMap.size());
        }
    } catch (StringIndexOutOfBoundsException oobe) {
        throw oobe; // pass it up to print error and stop processing
    } catch (Exception e) {
        myLogger.error("Good Barcodes Read: " + goodBarcodedReads);
        e.printStackTrace();
    }
}
/**
* This method removes all tags are are never repeated in a single sample (taxa). The concept is that
* all biologically real tag should show up twice somewhere. This could be called at the end of every
* flowcell to test all the novel tags.
*
* In addition, it removes the corresponding entry from the tag-qualityscore map. They must remain in synch
*/
private static void removeTagsWithoutReplication (TagDistributionMap masterTagTaxaMap, TagCountQualityScoreMap tagCntQSMap,int minTaxa, int minDepth) {
int currentSize = masterTagTaxaMap.size();
// int minTaxa=2;
System.out.println("Starting removeTagsWithoutReplication. Current tag number: " + currentSize);
LongAdder tagsRemoved=new LongAdder();
masterTagTaxaMap.entrySet().parallelStream().forEach(t -> {
TaxaDistribution td = t.getValue();
if(td.totalDepth() depth>1).count()> calculateTagAveQS(TagCountQualityScoreMap tagCntQSMap, int qualityScoreBase) {
Map> tagInstanceAverageQS = new ConcurrentHashMap>();
// the number of instances for each tag equals the number of values on the arraylist
// iterate through the list, add values to the tagInstanceAverageQS map
tagCntQSMap.entrySet().parallelStream().forEach(entry -> {
Tag tag = entry.getKey();
List scores = entry.getValue();
// add all the scores, divide by number of scores, convert back to string
int numInstances = scores.size();
int scoreLen = scores.get(0).length();
// the scores should all be the same length as the tag
int[] scoreArray = new int[scoreLen];
for (int idx = 0; idx < numInstances; idx++) {
String currentScore = scores.get(idx);
for (int jdx = 0; jdx < scoreLen; jdx++) {
scoreArray[jdx] += (currentScore.charAt(jdx) - qualityScoreBase);
}
}
// positions for all scores are added. now divide by number of scores to get
// the average, translate value back to string
StringBuilder sb = new StringBuilder();
for (int jdx = 0; jdx < scoreLen; jdx++) {
scoreArray[jdx] /= numInstances;
char ch = (char)(scoreArray[jdx] + qualityScoreBase);
sb.append(ch);
}
// Store to map
Tuple instanceScore = new Tuple(numInstances,sb.toString());
tagInstanceAverageQS.put(tag, instanceScore);
});
return tagInstanceAverageQS;
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
//     GeneratePluginCode.generate(RepGenLoadSeqToDBPlugin.class);
// }
/**
 * Convenience method to run plugin with one return object.
 */
// TODO: Replace <Type> with the specific return type.
// public <Type> runPlugin(DataSet input) {
//     return (<Type>) performFunction(input).getData(0).getData();
// }
/**
 * Input directory containing FASTQ files in text or gzipped text.
 * NOTE: Directory will be searched recursively and should
 * be written WITHOUT a slash after its name.
 *
 * @return Input Directory
 */
public String inputDirectory() {
    return myInputDir.value();
}
/**
 * Set Input Directory. Input directory containing FASTQ
 * files in text or gzipped text.
 * NOTE: Directory will be searched recursively and should
 * be written WITHOUT a slash after its name.
 *
 * @param value Input Directory
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin inputDirectory(String value) {
    myInputDir = new PluginParameter<>(myInputDir, value);
    return this;
}
/**
 * Key file listing barcodes distinguishing the samples.
 *
 * @return Key File
 */
public String keyFile() {
    return myKeyFile.value();
}
/**
 * Set Key File. Key file listing barcodes distinguishing
 * the samples.
 *
 * @param value Key File
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin keyFile(String value) {
    myKeyFile = new PluginParameter<>(myKeyFile, value);
    return this;
}
/**
 * Maximum Kmer Length. (Javadoc previously said "Tag Length";
 * renamed to match the "Maximum Kmer Length" GUI name.)
 *
 * @return Maximum Kmer Length
 */
public Integer kmerLength() {
    return myKmerLength.value();
}
/**
 * Set Maximum Kmer Length.
 *
 * @param value Maximum Kmer Length
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin kmerLength(Integer value) {
    myKmerLength = new PluginParameter<>(myKmerLength, value);
    return this;
}
/**
 * Minimum Kmer Length.
 *
 * @return Minimum Kmer Length
 */
public Integer minimumKmerLength() {
    return myMinKmerLength.value();
}
/**
 * Set Minimum Kmer Length.
 *
 * @param value Minimum Kmer Length
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin minimumKmerLength(Integer value) {
    myMinKmerLength = new PluginParameter<>(myMinKmerLength, value);
    return this;
}
/**
 * Minimum kmer count.
 *
 * @return Min Kmer Count
 */
public Integer minKmerCount() {
    return myMinKmerCount.value();
}
/**
 * Set Min Kmer Count. Minimum kmer count.
 *
 * @param value Min Kmer Count
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin minKmerCount(Integer value) {
    myMinKmerCount = new PluginParameter<>(myMinKmerCount, value);
    return this;
}
/**
 * Minimum number of taxa containing the tag.
 *
 * @return Min Taxa Count
 */
public Integer minTaxa() {
    return minTaxa.value();
}
/**
 * Set Min Taxa Count. Minimum number of taxa containing a tag.
 *
 * @param value Min Taxa containing a tag
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin minTaxa(Integer value) {
    minTaxa = new PluginParameter<>(minTaxa, value);
    return this;
}
/**
 * Output Database File.
 *
 * @return Output Database File
 */
public String outputDatabaseFile() {
    return myOutputDB.value();
}
/**
 * Set Output Database File.
 *
 * @param value Output Database File
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin outputDatabaseFile(String value) {
    myOutputDB = new PluginParameter<>(myOutputDB, value);
    return this;
}
/**
 * Minimum quality score within the barcode and read length
 * to be accepted.
 *
 * @return Minimum quality score
 */
public Integer minimumQualityScore() {
    return myMinQualScore.value();
}
/**
 * Set Minimum quality score. Minimum quality score within
 * the barcode and read length to be accepted.
 *
 * @param value Minimum quality score
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin minimumQualityScore(Integer value) {
    myMinQualScore = new PluginParameter<>(myMinQualScore, value);
    return this;
}
/**
 * Maximum number of kmers retained in memory.
 * (Getter added for consistency with the other plugin parameters.)
 *
 * @return Maximum Kmer Number
 */
public Integer maximumKmerNumber() {
    return myMaxKmerNumber.value();
}
/**
 * Set maximum number of kmers.
 *
 * @param value Maximum Kmer Number
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin maximumKmerNumber(Integer value) {
    myMaxKmerNumber = new PluginParameter<>(myMaxKmerNumber, value);
    return this;
}
/**
 * Number of FASTQ files processed per batch.
 * (Getter added for consistency with the other plugin parameters.)
 *
 * @return batch size
 */
public Integer batchSize() {
    return myBatchSize.value();
}
/**
 * Set number of Fastq files processed simultaneously.
 *
 * @param value batch size
 *
 * @return this plugin
 */
public RepGenLoadSeqToDBPlugin batchSize(Integer value) {
    myBatchSize = new PluginParameter<>(myBatchSize, value);
    return this;
}
// /**
// * Delete existing DB
// *
// * @return deleteOldData
// */
// public Boolean deleteOldData() {
// return myDeleteOldData.value();
// }
//
// /**
// * Set Delete old data flag. True indicates we want the
// * db tables cleared
// *
// * @param value true/false - whether to delete data
// *
// * @return this plugin
// */
// public RepGenLoadSeqToDBPlugin deleteOldData(Boolean value) {
// myDeleteOldData = new PluginParameter<>(myDeleteOldData, value);
// return this;
// }
@Override
public ImageIcon getIcon() {
    // No toolbar icon is provided for this plugin.
    return null;
}
@Override
public String getButtonName() {
    return "Discovery Tags By Taxa";
}
@Override
public String getToolTipText() {
    return "Discovery Tags By Taxa";
}
/**
* This ConcurrentHashMap constrain the size of the map, and purges low distribution count tags when the size needs
* to be reduced.
*/
static class TagDistributionMap extends ConcurrentHashMap {
private final int maxTagNum;
private int minDepthToRetainInMap=2;
private final int minCount;
TagDistributionMap (int maxTagNumber, float loadFactor, int concurrencyLevel, int minCount) {
super((maxTagNumber*2), loadFactor, concurrencyLevel);
maxTagNum = maxTagNumber;
this.minCount = minCount;
}
@Override
public TaxaDistribution put(Tag key, TaxaDistribution value) {
return super.put(key, value);
}
public synchronized void removeTagByCount(int minCnt) {
entrySet().parallelStream()
.filter(e -> e.getValue().totalDepth() remove(e.getKey()));
}
public long estimateMapMemorySize() {
long size=0;
int cnt=0;
for (Entry entry : entrySet()) {
size+=8+16+1; //Tag size
size+=16; //Map references
size+=entry.getValue().memorySize();
cnt++;
if(cnt>10000) break;
}
long estSize=(size()/cnt)*size;
return estSize;
}
public long[] depthDistribution() {
long[] base2bins=new long[34];
int cnt=0;
for (Entry entry : entrySet()) {
base2bins[31-Integer.numberOfLeadingZeros(entry.getValue().totalDepth())]++;
cnt++;
// if(cnt>100000) break;
}
return base2bins;
}
}
// LCJ - The easiest and fastest method is to keep track of all the
// quality strings seen for each instance of the tag. Some of these
// tags will be dropped by removeTagsWithoutReplication() and other
// methods. Those removals need to be synced up with this map.
// If we create a hashmap of tag/list-of-quality-strings, then after
// culling the list at the end, we can create the average score. The
// number of strings on the list gives us the number of tag instances.
// The map itself is a concurrent hashmap, but the list must also be
// safe for concurrent use. It can be declared as "List", but the value
// added must be a synchronized list (Collections.synchronizedList).
// http://stackoverflow.com/questions/5923405/concurrenthashmap-with-arraylist-as-value
// Cannot make this a multimap, as a multimap will not store duplicate quality scores.
public static class TagCountQualityScoreMap extends ConcurrentHashMap> {
private int tagCount;
TagCountQualityScoreMap (int maxTagNumber, float loadFactor, int concurrencyLevel, int minCount) {
super((maxTagNumber*2), loadFactor, concurrencyLevel);
}
public synchronized void removeTagByCount(int minCnt) {
// remove tags which are not represented a minimum number of times
// THis is determimed by how many quality strings occur for the tag
// should be consistent with removeTagByCount for TagDistributionMap
entrySet().parallelStream()
.filter(e -> e.getValue().size() remove(e.getKey()));
}
}
}
// © 2015 - 2025 Weber Informatics LLC (source-listing footer)