
net.maizegenetics.analysis.gbs.KmerToTBTPlugin Maven / Gradle / Ivy
/*
* KmerToTBTPlugin
*/
package net.maizegenetics.analysis.gbs;
import net.maizegenetics.dna.tag.TagsByTaxa;
import net.maizegenetics.dna.tag.TagsByTaxaShort;
import net.maizegenetics.dna.tag.TagsByTaxaByte;
import net.maizegenetics.dna.tag.TagCounts;
import net.maizegenetics.dna.tag.Tags;
import net.maizegenetics.dna.map.TagsOnPhysicalMap;
import net.maizegenetics.dna.tag.TagsByTaxa.FilePacking;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.util.ArgsEngine;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.MultiMemberGZIPInputStream;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.*;
import java.util.HashMap;
/**
* This pipeline converts a series of fastq files to TagsByTaxa files (one per
* fastq file). It requires a list of existing tags (Tags object), which may
* come from a TagCounts file or TOPM file.
*
* @author Qi
*/
public class KmerToTBTPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(KmerToTBTPlugin.class);
private ArgsEngine myArgsEngine = null;
private String[] myFastqFileS = null;
private String myOutputDir = null;
private int myMinCount = 1;
private Tags myMasterTags = null;
private static int maxGoodReads = 200000000; // maximum number of good barcoded reads expected in a fastq file
private static String nullS = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
private boolean useTBTByte = false;
private boolean useTBTShort = false;
public KmerToTBTPlugin() {
super(null, false);
}
public KmerToTBTPlugin(Frame parentFrame) {
super(parentFrame, false);
}
@Override
public DataSet performFunction(DataSet input) {
matchTagsToTaxa(myFastqFileS, myMasterTags, myOutputDir, myMinCount, useTBTByte, useTBTShort);
return null;
}
private void printUsage() {
myLogger.info(
"\nUsage is as follows:\n"
+ "-i Input directory containing .kmer.fa files\n"
+ "-o Output directory\n"
+ "-s Max good reads per lane. (Optional. Default is 200,000,000).\n"
+ "-c Minimum taxa count within a fastq file for a tag to be output (default 1)\n" // Nb: using TagsByTaxaBit, so max count PER TAXON = 1
+ "-y Output to tagsByTaxaByte (tag counts per taxon from 0 to 127) instead of tagsByTaxaBit (0 or 1)\n"
+ " -t Tag count file, OR A\n");
}
@Override
public void setParameters(String[] args) {
if (args.length == 0) {
printUsage();
throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
}
if (myArgsEngine == null) {
myArgsEngine = new ArgsEngine();
myArgsEngine.add("-i", "--input-directory", true);
myArgsEngine.add("-o", "--output-directory", true);
myArgsEngine.add("-s", "--max-reads", true);
myArgsEngine.add("-c", "--min-count", true);
myArgsEngine.add("-y", "--TBTByte", false);
myArgsEngine.add("-sh", "--TBTShort", false);
myArgsEngine.add("-t", "--tag-count", true);
myArgsEngine.add("-m", "--physical-map", true);
}
myArgsEngine.parse(args);
String tempDirectory = myArgsEngine.getString("-i");
if (myArgsEngine.getBoolean("-s")) {
maxGoodReads = Integer.parseInt(myArgsEngine.getString("-s"));
}
if (tempDirectory != null) {
File fastqDirectory = new File(tempDirectory);
if (!fastqDirectory.isDirectory()) {
printUsage();
throw new IllegalArgumentException("setParameters: The input name you supplied is not a directory: " + tempDirectory);
}
myFastqFileS = DirectoryCrawler.listFileNames("(?i).*\\.kmer\\.fa$|.*\\.kmer\\.fa\\.gz$", fastqDirectory.getAbsolutePath());
// (?i) denotes case insensitive; \\. denotes escape . so it doesn't mean 'any char'
// NOTE: If you add addtional file naming conventions here, you must also add them to the "outfile = new File(outFileS.replaceAll()" list below (near the bottom)
if (myFastqFileS.length == 0 || myFastqFileS == null) {
printUsage();
throw new IllegalArgumentException(
"Couldn't find any files that end with \".kmer.fa\", \".kmer.fa.gz\" in the supplied directory: "
+ tempDirectory);
} else {
myLogger.info("KmerToTBTPlugin: setParameters: Using the following fastq files:");
for (String filename : myFastqFileS) {
myLogger.info(filename);
}
}
}
if (myArgsEngine.getBoolean("-o")) {
myOutputDir = myArgsEngine.getString("-o");
File outDirectory = new File(myOutputDir);
if (!outDirectory.isDirectory()) {
printUsage();
throw new IllegalArgumentException("The output name you supplied (option -o) is not a directory: " + myOutputDir);
}
outDirectory = null;
} else {
printUsage();
throw new IllegalArgumentException("Please specify an output directory (option -o).");
}
if (myArgsEngine.getBoolean("-c")) {
myMinCount = Integer.parseInt(myArgsEngine.getString("-c"));
} else {
myMinCount = 1;
}
if (myArgsEngine.getBoolean("-y")) {
useTBTByte = true;
}
if (myArgsEngine.getBoolean("-y")) {
if (myArgsEngine.getBoolean("-sh")) {
printUsage();
throw new IllegalArgumentException("Options -y and -sh are mutually exclusive.");
}
useTBTByte = true;
} else if (myArgsEngine.getBoolean("-sh")) {
if (myArgsEngine.getBoolean("-y")) {
printUsage();
throw new IllegalArgumentException("Options -y and -sh are mutually exclusive.");
}
useTBTShort = true;
} else {
printUsage();
throw new IllegalArgumentException("Please specify byte format (-y) *OR* short int format (-sh).");
}
// Create Tags object from tag count file with option -t, or from TOPM file with option -m
if (myArgsEngine.getBoolean("-t")) {
if (myArgsEngine.getBoolean("-m")) {
printUsage();
throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
}
myMasterTags = new TagCounts(myArgsEngine.getString("-t"), FilePacking.Byte);
} else if (myArgsEngine.getBoolean("-m")) {
if (myArgsEngine.getBoolean("-t")) {
printUsage();
throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
}
myMasterTags = new TagsOnPhysicalMap(myArgsEngine.getString("-m"), true);
} else {
printUsage();
throw new IllegalArgumentException("Please specify a tagCounts file (-t) *OR* a TagsOnPhysicalMap file (-m)");
}
}
/**
* Uses an existing Tags object to create one TagsByTaxa file for each fastq
* file in the input directory.
*
* Output TBT files written to the outputDir, using fastq file names with
* extension changed to .tbt.bin (or .tbt.txt)
*
* @param fastqFileS Array of fastq file names (Illumina-created files with
* raw read sequence, quality score, machine name, etc.)
* @param theMasterTags A Tags object: list of tags to be included in the
* final TBT
* @param outputDir String containing the path of the output directory to
* contain tags-by-taxa files
* @param minCount The minimum number of times a tag must show up in a fastq
* file before it is included in the corresponding TBT file
*/
public static void matchTagsToTaxa(String[] fastqFileS, Tags theMasterTags, String outputDir, int minCount, boolean useTBTByte, boolean useTBTShort) {
HashMap readsPerSample = new HashMap(); //Maps samples to #reads/sample
HashMap mappedReadsPerSample = new HashMap(); //Maps samples to #reads/sample
for (int laneNum = 0; laneNum < fastqFileS.length; laneNum++) {
//Determine name of output file based on input parameters
File outfile;
FilePacking outFormat = useTBTByte ? FilePacking.Byte : (useTBTShort ? FilePacking.Short : FilePacking.Byte);
String outFileS = outputDir + fastqFileS[laneNum].substring(fastqFileS[laneNum].lastIndexOf(File.separator));
String replaceS = (outFormat == FilePacking.Text) ? ".tbt.txt" : ((outFormat == FilePacking.Byte) ? ".tbt.byte" : ((outFormat == FilePacking.Short) ? ".tbt.shrt" : ".tbt.bin"));
outfile = new File(outFileS.replaceAll(
"(?i)\\.kmer\\.fa$|\\.kmer\\.fa\\.gz$",
replaceS));
//Skip input file if a corresponding output file has already been written.
if (outfile.isFile()) {
System.out.println(
"An output file " + outfile.getName() + "\n"
+ " already exists in the output directory for file " + fastqFileS[laneNum] + ". Skipping.");
continue;
}
System.out.println("\nWorking on kmer file: " + fastqFileS[laneNum]);
TagsByTaxa theTBT = null;
System.gc();
int allReads = 0, goodMatched = 0;
File fastqFile = new File(fastqFileS[laneNum]);
String taxaName = fastqFile.getName();
taxaName = taxaName.substring(0, taxaName.indexOf("."));
//Create a new object to hold barcoded tags. The constructor can optionally process a group of fastq
//files. A minimum quality score for inclusion of a read can also be provided.
//Fill an array with taxon names.
String[] taxaNames = new String[1];
taxaNames[0] = taxaName;
if (useTBTByte) {
theTBT = new TagsByTaxaByte(taxaNames, theMasterTags);
} else if (useTBTShort) {
theTBT = new TagsByTaxaShort(taxaNames, theMasterTags);
}
// Read the fastq file and assign reads to tags and taxa
String temp = "";
allReads = 0;
goodMatched = 0;
try {
BufferedReader br;
//Read in fastq file as a gzipped text stream if its name ends in ".gz", otherwise read as text
if (fastqFileS[laneNum].endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(fastqFileS[laneNum]))));
} else {
br = new BufferedReader(new FileReader(fastqFileS[laneNum]), 65536);
}
String sl = "";
long[] binary_s1 = new long[2];
int kmercount;
while (((temp = br.readLine()) != null) && (allReads < maxGoodReads)) {
allReads++;
kmercount = Integer.parseInt(temp.replaceAll("[^\\d]", ""));
sl = br.readLine().trim();
sl = sl + nullS;
sl = sl.substring(0, 64);
binary_s1 = BaseEncoder.getLongArrayFromSeq(sl);
int h = theTBT.getTagIndex(binary_s1);
if (h > -1) {
theTBT.addReadsToTagTaxon(h, 0, 1);
goodMatched++;
}
if (allReads % 1000000 == 0) {
System.out.println("Total Reads:" + allReads + " goodMatched:" + goodMatched);
}
}
br.close();
} catch (Exception e) {
System.out.println("Catch testBasicPipeline read at =" + allReads + " e=" + e);
System.out.println(temp);
e.printStackTrace();
}
readsPerSample.put(taxaName, allReads);
mappedReadsPerSample.put(taxaName, allReads);
System.out.println("Timing process (writing TagsByTaxa file)...");
long timePoint1 = System.currentTimeMillis();
theTBT.writeDistFile(outfile, outFormat, minCount);
System.out.println("...process (writing KmerByTaxa file) took " + (System.currentTimeMillis() - timePoint1) + " milliseconds.");
System.out.println("Total number of kmer in file=" + allReads);
System.out.println("Total number of matched reads=" + goodMatched);
int filesDone = laneNum + 1;
System.out.println("Finished reading " + filesDone + " of " + fastqFileS.length + " sequence files: " + fastqFileS[laneNum] + "\n");
}
try {
String ofname = outputDir + "kmer.log";
DataOutputStream report = new DataOutputStream(new FileOutputStream(ofname));
report.writeBytes("name" + '\t' + "read count" + '\t' + "mapped read count" + '\n');
for (String name : readsPerSample.keySet()) {
int count = readsPerSample.get(name);
int mappedCount = mappedReadsPerSample.get(name);
report.writeBytes(name + '\t' + count + '\t' + mappedCount + '\n');
}
report.close();
} catch (Exception e) {
myLogger.warn("Caught exception while writing report file! " + e);
}
}
@Override
public ImageIcon getIcon() {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public String getButtonName() {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public String getToolTipText() {
throw new UnsupportedOperationException("Not supported yet.");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy