All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.SeqToTBTHDF5Plugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

The newest version!
/*
 * SeqToTBTHDF5Plugin
 */
package net.maizegenetics.analysis.gbs;

import cern.colt.list.IntArrayList;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.IOException;

import net.maizegenetics.dna.map.TagsOnPhysicalMap;
import net.maizegenetics.dna.tag.TagCounts;
import net.maizegenetics.dna.tag.Tags;
import net.maizegenetics.dna.tag.TagsByTaxa.FilePacking;
import net.maizegenetics.dna.tag.TagsByTaxaByteHDF5TaxaGroups;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.MultiMemberGZIPInputStream;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;

import java.awt.Frame;

import java.util.HashMap;

import javax.swing.ImageIcon;
import net.maizegenetics.plugindef.PluginParameter;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * This pipeline converts a series of fastq or qseq files to a single TagsByTaxa
 * HDF5 file. It requires a list of existing tags (Tags object), which may come
 * from a TagCounts file or TOPM file.
 *
 * @author ed and james
 */
public class SeqToTBTHDF5Plugin extends AbstractPlugin {

    private static final Logger myLogger = LogManager.getLogger(SeqToTBTHDF5Plugin.class);

    private PluginParameter myInputDir = new PluginParameter.Builder("i", null, String.class).guiName("Input Directory").required(true).inDir()
            .description("Input directory containing .fastq files").build();
    private PluginParameter myKeyFile = new PluginParameter.Builder("k", null, String.class).guiName("Key File").required(true).inFile()
            .description("Barcode key file").build();
    private PluginParameter myEnzyme = new PluginParameter.Builder("e", null, String.class).guiName("Enzyme")
            .description("Enzyme used to create the GBS library, if it differs from the one listed in the key file.").build();
    private PluginParameter myOutputFile = new PluginParameter.Builder("o", null, String.class).guiName("Output File").required(true).outFile()
            .description("Output HDF5 file").build();
    private PluginParameter myMaxGoodReads = new PluginParameter.Builder("s", 500_000_000, Integer.class).guiName("Max Good Reads")
            .description("Max good reads per lane.").build();
    private PluginParameter myLogFile = new PluginParameter.Builder("L", null, String.class).guiName("Log File").required(true).outFile()
            .description("Output log file").build();
    private PluginParameter myTagCountFile = new PluginParameter.Builder("t", null, String.class).guiName("Tag Count File").inFile()
            .description("Tag count file. (Only -t or -m allowed)").build();
    private PluginParameter myPhysicalMapFile = new PluginParameter.Builder("m", null, String.class).guiName("Physical Map File").inFile()
            .description("Physical map file containing alignments. (Only -t or -m allowed)").build();

    private String[] myFastqFileS = null;
    private Tags myMasterTags = null;
    private IntArrayList[] myTaxaReads;
    private int[] readsPerSample;
    private int[] mappedReadsPerSample;
    private int goodBarcodedReads = 0;
    private int allReads = 0;
    private int goodMatched = 0;
    private HashMap taxaNameToIndices;

    public SeqToTBTHDF5Plugin() {
        super(null, false);
    }

    public SeqToTBTHDF5Plugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    public DataSet processData(DataSet input) {
        File possibleFile = new File(outputFile());
        if (possibleFile.exists() == false) {
            TagsByTaxaByteHDF5TaxaGroups theTBT = new TagsByTaxaByteHDF5TaxaGroups(myMasterTags, outputFile());
        }
        matchTagsToTaxa(myFastqFileS, keyFile(), enzyme(), myMasterTags, outputFile(), logFile());
        return null;
    }

    @Override
    public void postProcessParameters() {

        String inputDirectory = inputDirectory();

        if (inputDirectory != null) {
            File fastqDirectory = new File(inputDirectory);
            if (!fastqDirectory.isDirectory()) {
                throw new IllegalArgumentException("setParameters: The input name you supplied is not a directory: " + inputDirectory);
            }
            myFastqFileS = DirectoryCrawler.listFileNames("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$|.*_qseq\\.txt$|.*_qseq\\.txt\\.gz$", fastqDirectory.getAbsolutePath());
            //    (?i) denotes case insensitive;                 \\. denotes escape . so it doesn't mean 'any char'
            // NOTE: If you add addtional file naming conventions here, you must also add them to the "outfile = new File(outFileS.replaceAll()" list below (near the bottom)
            if (myFastqFileS.length == 0 || myFastqFileS == null) {
                throw new IllegalArgumentException(
                        "Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory: "
                        + inputDirectory);
            } else {
                myLogger.info("SeqToTBTHDF5Plugin: setParameters: Using the following fastq files:");
                for (String filename : myFastqFileS) {
                    myLogger.info(filename);
                }
            }
        }

        if (enzyme() == null) {
            myLogger.warn("No enzyme specified.  Using enzyme listed in key file.");
        }

        // Create Tags object from tag count file with option -t, or from TOPM file with option -m
        if ((tagCountFile() != null) && !tagCountFile().isEmpty()) {
            if ((physicalMapFile() != null) && !physicalMapFile().isEmpty()) {
                throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
            }
            myMasterTags = new TagCounts(tagCountFile(), FilePacking.Byte);
        } else if ((physicalMapFile() != null) && !physicalMapFile().isEmpty()) {
            if ((tagCountFile() != null) && !tagCountFile().isEmpty()) {
                throw new IllegalArgumentException("Options -t and -m are mutually exclusive.");
            }
            myMasterTags = new TagsOnPhysicalMap(physicalMapFile(), true);
        } else {
            throw new IllegalArgumentException("Please specify a tagCounts file (-t) *OR* a TagsOnPhysicalMap file (-m)");
        }
    }

    /**
     * Uses an existing Tags object to create one TagsByTaxa file for each fastq
     * file in the input directory.
     *
     * Output TBT files written to the outputDir, using fastq file names with
     * extension changed to .tbt.bin (or .tbt.txt)
     *
     * @param fastqFileS Array of fastq file names (Illumina-created files with
     * raw read sequence, quality score, machine name, etc.)
     * @param keyFileS A key file (list of taxa by barcode, lane & flow cell,
     * including plate maps)
     * @param enzyme The enzyme used to make the library (currently ApeKI or
     * PstI)
     * @param theMasterTags A Tags object: list of tags to be included in the
     * final TBT
     * @param outputDir String containing the path of the output directory to
     * contain tags-by-taxa files
     * @param minCount The minimum number of times a tag must show up in a fastq
     * file before it is included in the corresponding TBT file
     */
    public void matchTagsToTaxa(String[] fastqFileS, String keyFileS, String enzyme, Tags theMasterTags, String outputTBT, String outputLog) {
        for (int laneNum = 0; laneNum < fastqFileS.length; laneNum++) {
            //            IntArrayList[] taxaReads;
            //            int[] readsPerSample, mappedReadsPerSample;
            //            int goodBarcodedReads = 0, allReads = 0, goodMatched = 0;

            goodBarcodedReads = allReads = goodMatched = 0;

            boolean isFastQ = true;
            if (fastqFileS[laneNum].substring(fastqFileS[laneNum].lastIndexOf(File.separator)).contains("qseq")) {
                isFastQ = false;
            }

            System.out.println("\nWorking on fastq file: " + fastqFileS[laneNum]);

            File fastqFile = new File(fastqFileS[laneNum]);

            ParseBarcodeRead thePBR = initParseBarcodeRead(keyFileS, enzyme, fastqFile);
            if (thePBR == null) {
                continue;  //if no barcodes found move on
            }
            //Fill an array with taxon names.
            String[] taxaNames = new String[thePBR.getBarCodeCount()];
            myTaxaReads = new IntArrayList[thePBR.getBarCodeCount()];
            mappedReadsPerSample = new int[thePBR.getBarCodeCount()];
            readsPerSample = new int[thePBR.getBarCodeCount()];
            taxaNameToIndices = new HashMap();

            for (int i = 0; i < taxaNames.length; i++) {
                taxaNames[i] = thePBR.getTheBarcodes(i).getTaxaName();
                myTaxaReads[i] = new IntArrayList(200000000 / taxaNames.length);
                taxaNameToIndices.put(taxaNames[i], i);
            }
            int currLine = 0;
            goodBarcodedReads = 0;
            allReads = 0;
            goodMatched = 0;
            try {
                BufferedReader br = getBufferedReader(fastqFileS[laneNum]);
                String[] seqAndQual;
                while (((seqAndQual = getNextSeq(br, isFastQ)) != null) && (goodBarcodedReads < maxGoodReads())) {
                    allReads++;
                    currLine++;
                    //After quality score is read, decode barcode using the current sequence & quality  score
                    ReadBarcodeResult rr = thePBR.parseReadIntoTagAndTaxa(seqAndQual[0], seqAndQual[1], true, 0);
                    if (rr != null) {  //If read is barcoded, note this fact and find tag in the TOPM file
                        goodBarcodedReads++;
                        int t = taxaNameToIndices.get(rr.getTaxonName());
                        int h = theMasterTags.getTagIndex(rr.getRead());
                        readsPerSample[t]++;
                        if (h > -1) {
                            goodMatched++;
                            mappedReadsPerSample[t]++;
                            myTaxaReads[t].add(h);
                        }
                    }
                    if (allReads % 1000000 == 0) {
                        System.out.println("Total Reads:" + allReads + " goodReads:" + goodBarcodedReads + " goodMatched:" + goodMatched);
                    }
                }
                br.close();
            } catch (Exception e) {
                System.out.println("Catch testBasicPipeline c=" + goodBarcodedReads + " e=" + e);
                e.printStackTrace();
            }
            System.out.println("Timing process (writing TagsByTaxa file)...");
            long timePoint1 = System.currentTimeMillis();
            writeReport(fastqFileS[laneNum], outputLog);
            writeTBT(outputTBT);
            System.out.println("...process (writing TagsByTaxa file) took " + (System.currentTimeMillis() - timePoint1) + " milliseconds.");
            System.out.println("Total number of reads in lane=" + allReads);
            System.out.println("Total number of good, barcoded reads=" + goodBarcodedReads);
            int filesDone = laneNum + 1;
            System.out.println("Finished reading " + filesDone + " of " + fastqFileS.length + " sequence files: " + fastqFileS[laneNum] + "\n");
//            //Write report to a file named after the TBT file, with the extension ".log"
//            float goodPct = ((float) goodBarcodedReads) / ((float) allReads);
//            float goodMappedPct = ((float) goodMatched) / ((float) allReads);
//            try {
//                DataOutputStream report = new DataOutputStream(new FileOutputStream(outputLog, true));
//                report.writeBytes(
//                        "File: " + fastqFileS[laneNum] + '\n'
//                        + "Total reads: " + allReads + '\n'
//                        + "Accepted reads (with barcode and cut site): " + goodBarcodedReads + "(" + goodPct + " of total)" + '\n'
//                        + "Accepted reads found in TOPM: " + goodMatched + "(" + goodMappedPct + " of total)" + '\n'
//                        + "name" + '\t' + "read count" + '\t' + "fraction of total" + '\t' + "mapped read count" + '\t' + "fraction mapped of total" + '\n');
//               // TagsByTaxaByteHDF5TaxaGroups theTBT = new TagsByTaxaByteHDF5TaxaGroups(outputTBT);
//                for (String name : taxaNameToIndices.keySet()) {
//                    int t=taxaNameToIndices.get(name);
//                    int count = readsPerSample[t];
//                    int mappedCount = mappedReadsPerSample[t];
//                    float pct = ((float) count) / ((float) allReads);
//                    float mappedPct = ((float) mappedCount) / ((float) count);
//                    byte[] tagByteDist=getTagsDistribution(theMasterTags.getTagCount(),taxaReads[t]);
//                    if(TagsByTaxaByteHDF5TaxaGroups.addTaxon(outputTBT, name, tagByteDist)) {
//                        System.out.printf("Taxon %s written to %s %n", name, outputTBT);
//                    } else {
//                        System.err.printf("Error Writing Taxon %s written to %s %n", name, outputTBT);
//                    }
//                    report.writeBytes(name + '\t' + count + '\t' + pct + '\t' + mappedCount + '\t' + mappedPct + '\n');
//                }
//                report.close();
//             //   theTBT.getFileReadyForClosing();
//            } catch (Exception e) {
//                myLogger.warn("Caught exception while writing report file for lane " + laneNum + ": " + e);
//            }
        }
    }

    private synchronized boolean writeReport(String seqFile, String outputLog) {
        //Write report to a file named after the TBT file, with the extension ".log"
        float goodPct = ((float) goodBarcodedReads) / ((float) allReads);
        float goodMappedPct = ((float) goodMatched) / ((float) allReads);
        try {
            DataOutputStream report = new DataOutputStream(new FileOutputStream(outputLog, true));
            report.writeBytes(
                    "File: " + seqFile + '\n'
                    + "Total reads: " + allReads + '\n'
                    + "Accepted reads (with barcode and cut site): " + goodBarcodedReads + "(" + goodPct + " of total)" + '\n'
                    + "Accepted reads found in TOPM: " + goodMatched + "(" + goodMappedPct + " of total)" + '\n'
                    + "name" + '\t' + "read count" + '\t' + "fraction of total" + '\t' + "mapped read count" + '\t' + "fraction mapped of total" + '\n');
            // TagsByTaxaByteHDF5TaxaGroups theTBT = new TagsByTaxaByteHDF5TaxaGroups(outputTBT);
            for (String name : taxaNameToIndices.keySet()) {
                int t = taxaNameToIndices.get(name);
                int count = readsPerSample[t];
                int mappedCount = mappedReadsPerSample[t];
                float pct = ((float) count) / ((float) allReads);
                float mappedPct = ((float) mappedCount) / ((float) count);
                report.writeBytes(name + '\t' + count + '\t' + pct + '\t' + mappedCount + '\t' + mappedPct + '\n');
            }
            report.close();
            //   theTBT.getFileReadyForClosing();
        } catch (Exception e) {
            myLogger.warn("Caught exception while writing report file for file " + seqFile + ": " + e);
            return false;
        }
        return true;
    }

    private synchronized boolean writeTBT(String outputTBT) {
        TagsByTaxaByteHDF5TaxaGroups theTBT = new TagsByTaxaByteHDF5TaxaGroups(outputTBT);
        for (String name : taxaNameToIndices.keySet()) {
            int t = taxaNameToIndices.get(name);
            byte[] tagByteDist = getTagsDistribution(theTBT.getTagCount(), myTaxaReads[t]);
            theTBT.addTaxon(name, tagByteDist);
            System.out.printf("Taxon %s written to %s %n", name, outputTBT);
        }
        theTBT.getFileReadyForClosing();
        theTBT = null;
        try {
            System.out.println("Sleeping thread" + Thread.currentThread().getName());
            Thread.sleep(5000);
        } catch (InterruptedException ie) {
            System.err.println("Sleeping failed");
            return false;
        }
        return true;
    }

    private static String[] getNextSeq(BufferedReader br, boolean isFastQ) {
        String[] s = new String[2];
        try {
            if (isFastQ) {
                String temp = br.readLine();  //read sequence identifier
                if (temp == null) {
                    return null;
                }
                while (temp.charAt(0) != '@') {
                    temp = br.readLine();
                    if (temp == null) {
                        return null;
                    }
                }
                s[0] = br.readLine(); //read sequence
                br.readLine();  //read quality identifier
                s[1] = br.readLine();  //read quality
            } else {
                String temp = br.readLine();  //read sequence identifier
                if (temp == null) {
                    return null;
                }
                String[] jj = temp.split("\\s");
                s[0] = jj[8];
                s[1] = jj[9];
            }
        } catch (IOException e) {
            System.out.println("File closing");
            return null;
        }
        return s;
    }

    private static BufferedReader getBufferedReader(String fileName) {
        BufferedReader br = null;
        try {
            if (fileName.endsWith(".gz")) {
                br = new BufferedReader(new InputStreamReader(new MultiMemberGZIPInputStream(new FileInputStream(fileName))));
            } else {
                br = new BufferedReader(new FileReader(fileName), 65536);
            }
        } catch (IOException e) {
            System.out.println("Failed to open file:" + fileName);

        }
        return br;
    }

    private static byte[] getTagsDistribution(int tagNumber, IntArrayList tagDist) {
        byte[] result = new byte[tagNumber];
        for (int i : tagDist.elements()) {
            if (result[i] < Byte.MAX_VALUE) {
                result[i]++;
            }
        }
        return result;
    }

    private static ParseBarcodeRead initParseBarcodeRead(String keyFileS, String enzyme, File fastqFile) {
        ParseBarcodeRead thePBR = null;
        String[] np = fastqFile.getName().split("_");

        //Create a new object to hold barcoded tags.  The constructor can optionally process a group of fastq
        //files.  A minimum quality score for inclusion of a read can also be provided.
        if (np.length == 3) {
            thePBR = new ParseBarcodeRead(keyFileS, enzyme, np[0], np[1]);
        } else if (np.length == 5) {
            thePBR = new ParseBarcodeRead(keyFileS, enzyme, np[1], np[3]);
        } else if (np.length == 4) {
            thePBR = new ParseBarcodeRead(keyFileS, enzyme, np[0], np[2]);
        } else if (np.length == 6) {
            thePBR = new ParseBarcodeRead(keyFileS, enzyme, np[1], np[3]);
        } else {
            System.out.println("Error in parsing file name:");
            System.out.println("   The filename does not contain either 3 or 5 underscore-delimited values.");
            System.out.println("   Expect: flowcell_lane_fastq.txt OR code_flowcell_s_lane_fastq.txt");
            System.out.println("   Filename: " + fastqFile.getName());
            return null;
        }
        System.out.println("Total barcodes found in lane:" + thePBR.getBarCodeCount());
        if (thePBR.getBarCodeCount() == 0) {
            System.out.println("No barcodes found.  Skipping this flowcell.");
            return null;
        }
        return thePBR;
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "Seq to TBT HDF5";
    }

    @Override
    public String getToolTipText() {
        return "Seq to TBT HDF5";
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(SeqToTBTHDF5Plugin.class);
    // }
    /**
     * Input directory containing .fastq files
     *
     * @return Input Directory
     */
    public String inputDirectory() {
        return myInputDir.value();
    }

    /**
     * Set Input Directory. Input directory containing .fastq files
     *
     * @param value Input Directory
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin inputDirectory(String value) {
        myInputDir = new PluginParameter<>(myInputDir, value);
        return this;
    }

    /**
     * Barcode key file
     *
     * @return Key File
     */
    public String keyFile() {
        return myKeyFile.value();
    }

    /**
     * Set Key File. Barcode key file
     *
     * @param value Key File
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin keyFile(String value) {
        myKeyFile = new PluginParameter<>(myKeyFile, value);
        return this;
    }

    /**
     * Enzyme used to create the GBS library, if it differs from the one listed
     * in the key file.
     *
     * @return Enzyme
     */
    public String enzyme() {
        return myEnzyme.value();
    }

    /**
     * Set Enzyme. Enzyme used to create the GBS library, if it differs from the
     * one listed in the key file.
     *
     * @param value Enzyme
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin enzyme(String value) {
        myEnzyme = new PluginParameter<>(myEnzyme, value);
        return this;
    }

    /**
     * Output HDF5 file
     *
     * @return Output File
     */
    public String outputFile() {
        return myOutputFile.value();
    }

    /**
     * Set Output File. Output HDF5 file
     *
     * @param value Output File
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin outputFile(String value) {
        myOutputFile = new PluginParameter<>(myOutputFile, value);
        return this;
    }

    /**
     * Max good reads per lane.
     *
     * @return Max Good Reads
     */
    public Integer maxGoodReads() {
        return myMaxGoodReads.value();
    }

    /**
     * Set Max Good Reads. Max good reads per lane.
     *
     * @param value Max Good Reads
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin maxGoodReads(Integer value) {
        myMaxGoodReads = new PluginParameter<>(myMaxGoodReads, value);
        return this;
    }

    /**
     * Output log file
     *
     * @return Log File
     */
    public String logFile() {
        return myLogFile.value();
    }

    /**
     * Set Log File. Output log file
     *
     * @param value Log File
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin logFile(String value) {
        myLogFile = new PluginParameter<>(myLogFile, value);
        return this;
    }

    /**
     * Tag count file. (Only -t or -m allowed)
     *
     * @return Tag Count File
     */
    public String tagCountFile() {
        return myTagCountFile.value();
    }

    /**
     * Set Tag Count File. Tag count file. (Only -t or -m allowed)
     *
     * @param value Tag Count File
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin tagCountFile(String value) {
        myTagCountFile = new PluginParameter<>(myTagCountFile, value);
        return this;
    }

    /**
     * Physical map file containing alignments. (Only -t or -m allowed)
     *
     * @return Physical Map File
     */
    public String physicalMapFile() {
        return myPhysicalMapFile.value();
    }

    /**
     * Set Physical Map File. Physical map file containing alignments. (Only -t
     * or -m allowed)
     *
     * @param value Physical Map File
     *
     * @return this plugin
     */
    public SeqToTBTHDF5Plugin physicalMapFile(String value) {
        myPhysicalMapFile = new PluginParameter<>(myPhysicalMapFile, value);
        return this;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy