All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.KmerToTBTPlugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
/*
 * KmerToTBTPlugin
 */
package net.maizegenetics.analysis.gbs;

import net.maizegenetics.dna.tag.TagsByTaxa;
import net.maizegenetics.dna.tag.TagsByTaxaShort;
import net.maizegenetics.dna.tag.TagsByTaxaByte;
import net.maizegenetics.dna.tag.TagCounts;
import net.maizegenetics.dna.tag.Tags;
import net.maizegenetics.dna.map.TagsOnPhysicalMap;
import net.maizegenetics.dna.tag.TagsByTaxa.FilePacking;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.util.ArgsEngine;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.MultiMemberGZIPInputStream;
import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.*;
import java.util.HashMap;

/**
 * This pipeline converts a series of fastq files to TagsByTaxa files (one per
 * fastq file). It requires a list of existing tags (Tags object), which may
 * come from a TagCounts file or TOPM file.
 *
 * @author Qi
 */
public class KmerToTBTPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(KmerToTBTPlugin.class);
    private ArgsEngine myArgsEngine = null;
    private String[] myFastqFileS = null;
    private String myOutputDir = null;
    private int myMinCount = 1;
    private Tags myMasterTags = null;
    private static int maxGoodReads = 200000000; // maximum number of good barcoded reads expected in a fastq file
    private static String nullS = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    private boolean useTBTByte = false;
    private boolean useTBTShort = false;

    public KmerToTBTPlugin() {
        super(null, false);
    }

    public KmerToTBTPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    @Override
    public DataSet performFunction(DataSet input) {
        matchTagsToTaxa(myFastqFileS, myMasterTags, myOutputDir, myMinCount, useTBTByte, useTBTShort);
        return null;
    }

    private void printUsage() {
        myLogger.info(
                "\nUsage is as follows:\n"
                + "-i  Input directory containing .kmer.fa files\n"
                + "-o  Output directory\n"
                + "-s  Max good reads per lane. (Optional. Default is 200,000,000).\n"
                + "-c  Minimum taxa count within a fastq file for a tag to be output (default 1)\n" // Nb: using TagsByTaxaBit, so max count PER TAXON = 1
                + "-y  Output to tagsByTaxaByte (tag counts per taxon from 0 to 127) instead of tagsByTaxaBit (0 or 1)\n"
                + "    -t  Tag count file, OR A\n");
    }

    @Override
    public void setParameters(String[] args) {
        if (args.length == 0) {
            printUsage();
            throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
        }

        if (myArgsEngine == null) {
            myArgsEngine = new ArgsEngine();
            myArgsEngine.add("-i", "--input-directory", true);
            myArgsEngine.add("-o", "--output-directory", true);
            myArgsEngine.add("-s", "--max-reads", true);
            myArgsEngine.add("-c", "--min-count", true);
            myArgsEngine.add("-y", "--TBTByte", false);
            myArgsEngine.add("-sh", "--TBTShort", false);
            myArgsEngine.add("-t", "--tag-count", true);
            myArgsEngine.add("-m", "--physical-map", true);
        }
        myArgsEngine.parse(args);

        String tempDirectory = myArgsEngine.getString("-i");

        if (myArgsEngine.getBoolean("-s")) {
            maxGoodReads = Integer.parseInt(myArgsEngine.getString("-s"));
        }

        if (tempDirectory != null) {
            File fastqDirectory = new File(tempDirectory);
            if (!fastqDirectory.isDirectory()) {
                printUsage();
                throw new IllegalArgumentException("setParameters: The input name you supplied is not a directory: " + tempDirectory);
            }
            myFastqFileS = DirectoryCrawler.listFileNames("(?i).*\\.kmer\\.fa$|.*\\.kmer\\.fa\\.gz$", fastqDirectory.getAbsolutePath());
            //    (?i) denotes case insensitive;                 \\. denotes escape . so it doesn't mean 'any char'
            // NOTE: If you add addtional file naming conventions here, you must also add them to the "outfile = new File(outFileS.replaceAll()" list below (near the bottom)
            if (myFastqFileS.length == 0 || myFastqFileS == null) {
                printUsage();
                throw new IllegalArgumentException(
                        "Couldn't find any files that end with \".kmer.fa\", \".kmer.fa.gz\" in the supplied directory: "
                        + tempDirectory);
            } else {
                myLogger.info("KmerToTBTPlugin: setParameters: Using the following fastq files:");
                for (String filename : myFastqFileS) {
                    myLogger.info(filename);
                }
            }
        }

        if (myArgsEngine.getBoolean("-o")) {
            myOutputDir = myArgsEngine.getString("-o");
            File outDirectory = new File(myOutputDir);
            if (!outDirectory.isDirectory()) {
                printUsage();
                throw new IllegalArgumentException("The output name you supplied (option -o) is not a directory: " + myOutputDir);
            }
            outDirectory = null;
        } else {
            printUsage();
            throw new IllegalArgumentException("Please specify an output directory (option -o).");
        }
        if (myArgsEngine.getBoolean("-c")) {
            myMinCount = Integer.parseInt(myArgsEngine.getString("-c"));
        } else {
            myMinCount = 1;
        }
        if (myArgsEngine.getBoolean("-y")) {
            useTBTByte = true;
        }
        if (myArgsEngine.getBoolean("-y")) {
            if (myArgsEngine.getBoolean("-sh")) {
                printUsage();
                throw new IllegalArgumentException("Options -y and -sh are mutually exclusive.");
            }
            useTBTByte = true;
        } else if (myArgsEngine.getBoolean("-sh")) {
            if (myArgsEngine.getBoolean("-y")) {
                printUsage();
                throw new IllegalArgumentException("Options -y and -sh are mutually exclusive.");
            }
            useTBTShort = true;
        } else {
            printUsage();
            throw new IllegalArgumentException("Please specify byte format (-y) *OR* short int format (-sh).");
        }

        // Create Tags object from tag count file with option -t, or from TOPM file with option -m
        if (myArgsEngine.getBoolean("-t")) {
            if (myArgsEngine.getBoolean("-m")) {
                printUsage();
                throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
            }
            myMasterTags = new TagCounts(myArgsEngine.getString("-t"), FilePacking.Byte);
        } else if (myArgsEngine.getBoolean("-m")) {
            if (myArgsEngine.getBoolean("-t")) {
                printUsage();
                throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
            }
            myMasterTags = new TagsOnPhysicalMap(myArgsEngine.getString("-m"), true);
        } else {
            printUsage();
            throw new IllegalArgumentException("Please specify a tagCounts file (-t) *OR* a TagsOnPhysicalMap file (-m)");
        }
    }

    /**
     * Uses an existing Tags object to create one TagsByTaxa file for each fastq
     * file in the input directory.
     *
     * Output TBT files written to the outputDir, using fastq file names with
     * extension changed to .tbt.bin (or .tbt.txt)
     *
     * @param fastqFileS Array of fastq file names (Illumina-created files with
     * raw read sequence, quality score, machine name, etc.)
     * @param theMasterTags A Tags object: list of tags to be included in the
     * final TBT
     * @param outputDir String containing the path of the output directory to
     * contain tags-by-taxa files
     * @param minCount The minimum number of times a tag must show up in a fastq
     * file before it is included in the corresponding TBT file
     */
    public static void matchTagsToTaxa(String[] fastqFileS, Tags theMasterTags, String outputDir, int minCount, boolean useTBTByte, boolean useTBTShort) {
        HashMap readsPerSample = new HashMap();       //Maps samples to #reads/sample
        HashMap mappedReadsPerSample = new HashMap();       //Maps samples to #reads/sample

        for (int laneNum = 0; laneNum < fastqFileS.length; laneNum++) {


            //Determine name of output file based on input parameters
            File outfile;
            FilePacking outFormat = useTBTByte ? FilePacking.Byte : (useTBTShort ? FilePacking.Short : FilePacking.Byte);
            String outFileS = outputDir + fastqFileS[laneNum].substring(fastqFileS[laneNum].lastIndexOf(File.separator));
            String replaceS = (outFormat == FilePacking.Text) ? ".tbt.txt" : ((outFormat == FilePacking.Byte) ? ".tbt.byte" : ((outFormat == FilePacking.Short) ? ".tbt.shrt" : ".tbt.bin"));
            outfile = new File(outFileS.replaceAll(
                    "(?i)\\.kmer\\.fa$|\\.kmer\\.fa\\.gz$",
                    replaceS));

            //Skip input file if a corresponding output file has already been written.
            if (outfile.isFile()) {
                System.out.println(
                        "An output file " + outfile.getName() + "\n"
                        + " already exists in the output directory for file " + fastqFileS[laneNum] + ".  Skipping.");
                continue;
            }

            System.out.println("\nWorking on kmer file: " + fastqFileS[laneNum]);
            TagsByTaxa theTBT = null;
            System.gc();
            int allReads = 0, goodMatched = 0;
            File fastqFile = new File(fastqFileS[laneNum]);
            String taxaName = fastqFile.getName();
            taxaName = taxaName.substring(0, taxaName.indexOf("."));

            //Create a new object to hold barcoded tags.  The constructor can optionally process a group of fastq
            //files.  A minimum quality score for inclusion of a read can also be provided.


            //Fill an array with taxon names.
            String[] taxaNames = new String[1];
            taxaNames[0] = taxaName;

            if (useTBTByte) {
                theTBT = new TagsByTaxaByte(taxaNames, theMasterTags);
            } else if (useTBTShort) {
                theTBT = new TagsByTaxaShort(taxaNames, theMasterTags);
            }

            // Read the fastq file and assign reads to tags and taxa
            String temp = "";
            allReads = 0;
            goodMatched = 0;
            try {
                BufferedReader br;
                //Read in fastq file as a gzipped text stream if its name ends in ".gz", otherwise read as text
                if (fastqFileS[laneNum].endsWith(".gz")) {
                    br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(fastqFileS[laneNum]))));
                } else {
                    br = new BufferedReader(new FileReader(fastqFileS[laneNum]), 65536);
                }
                String sl = "";
                long[] binary_s1 = new long[2];
                int kmercount;
                while (((temp = br.readLine()) != null) && (allReads < maxGoodReads)) {
                    allReads++;

                    kmercount = Integer.parseInt(temp.replaceAll("[^\\d]", ""));
                    sl = br.readLine().trim();
                    sl = sl + nullS;
                    sl = sl.substring(0, 64);

                    binary_s1 = BaseEncoder.getLongArrayFromSeq(sl);
                    int h = theTBT.getTagIndex(binary_s1);
                    if (h > -1) {
                        theTBT.addReadsToTagTaxon(h, 0, 1);
                        goodMatched++;
                    }
                    if (allReads % 1000000 == 0) {
                        System.out.println("Total Reads:" + allReads + " goodMatched:" + goodMatched);
                    }
                }
                br.close();
            } catch (Exception e) {
                System.out.println("Catch testBasicPipeline read at =" + allReads + " e=" + e);
                System.out.println(temp);
                e.printStackTrace();
            }
            readsPerSample.put(taxaName, allReads);
            mappedReadsPerSample.put(taxaName, allReads);
            System.out.println("Timing process (writing TagsByTaxa file)...");
            long timePoint1 = System.currentTimeMillis();
            theTBT.writeDistFile(outfile, outFormat, minCount);
            System.out.println("...process (writing KmerByTaxa file) took " + (System.currentTimeMillis() - timePoint1) + " milliseconds.");
            System.out.println("Total number of kmer in file=" + allReads);
            System.out.println("Total number of matched reads=" + goodMatched);
            int filesDone = laneNum + 1;
            System.out.println("Finished reading " + filesDone + " of " + fastqFileS.length + " sequence files: " + fastqFileS[laneNum] + "\n");
        }
        try {
            String ofname = outputDir + "kmer.log";
            DataOutputStream report = new DataOutputStream(new FileOutputStream(ofname));
            report.writeBytes("name" + '\t' + "read count" + '\t' + "mapped read count" + '\n');

            for (String name : readsPerSample.keySet()) {
                int count = readsPerSample.get(name);
                int mappedCount = mappedReadsPerSample.get(name);
                report.writeBytes(name + '\t' + count + '\t' + mappedCount + '\n');
            }
            report.close();
        } catch (Exception e) {
            myLogger.warn("Caught exception while writing report file! " + e);
        }
    }

    @Override
    public ImageIcon getIcon() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    @Override
    public String getButtonName() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    @Override
    public String getToolTipText() {
        throw new UnsupportedOperationException("Not supported yet.");
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy